{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "47660991-7ad9-42f8-8fa0-5fffed7f245a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "import torch.nn.functional as F\n",
    "from sklearn.manifold import TSNE\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import scipy.stats as stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "67c479a6-8dbf-448b-a07e-7685dea6f7a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocessed SMT dataset; the path is relative to the notebook's working directory.\n",
    "data_path = \"../Dataset/SMT_Dataset/preprocessed_human_smt_dataset.csv\"\n",
    "\n",
    "# Parse the CSV into the frame that the following cells filter and summarise.\n",
    "df = pd.read_csv(filepath_or_buffer=data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "41a2e46b-43d8-4baf-8d3a-2e281ab5a2eb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['participant_id', 'session_no', 'task_type', 'trial_no', 'day', 'block',\n",
       "       'start_point_x', 'start_point_y', 'target_point_x', 'target_point_y',\n",
       "       'start_time', 'end_time', 'quadrant', 'is_success', 'actual_dist',\n",
       "       'movement_dist', 'completion_time', 'path', 'time_string',\n",
       "       'time_diff_ms', 'Age', 'Cohort', 'Gestational_Age',\n",
       "       'mabc_total_test_score', 'mabc_standard_score', 'mabc_percentile',\n",
       "       'distances', 'rmsd', 'normalized_trajectory', 'movement_speed',\n",
       "       'accuracy'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['movement_speed'] = df['movement_dist'] / df['completion_time']\n",
    "df['accuracy'] = 1/df['rmsd']\n",
    "df = df.replace([np.inf, -np.inf], np.nan)\n",
    "df = df.dropna()\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d9f8c09a-1d75-424a-a3ce-9e8fdc589373",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>participant_id</th>\n",
       "      <th>task_type</th>\n",
       "      <th>Age</th>\n",
       "      <th>Cohort</th>\n",
       "      <th>Gestational_Age</th>\n",
       "      <th>mabc_percentile</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>Preterm</td>\n",
       "      <td>33.0</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>Preterm</td>\n",
       "      <td>33.0</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>Preterm</td>\n",
       "      <td>33.0</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>Preterm</td>\n",
       "      <td>33.0</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>Preterm</td>\n",
       "      <td>33.0</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16315</th>\n",
       "      <td>67</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>Term</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16316</th>\n",
       "      <td>67</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>Term</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16317</th>\n",
       "      <td>67</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>Term</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16318</th>\n",
       "      <td>67</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>Term</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16319</th>\n",
       "      <td>67</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>Term</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>16320 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       participant_id  task_type  Age   Cohort  Gestational_Age  \\\n",
       "0                   0          0    5  Preterm             33.0   \n",
       "1                   0          0    5  Preterm             33.0   \n",
       "2                   0          0    5  Preterm             33.0   \n",
       "3                   0          0    5  Preterm             33.0   \n",
       "4                   0          0    5  Preterm             33.0   \n",
       "...               ...        ...  ...      ...              ...   \n",
       "16315              67          1    8     Term             40.0   \n",
       "16316              67          1    8     Term             40.0   \n",
       "16317              67          1    8     Term             40.0   \n",
       "16318              67          1    8     Term             40.0   \n",
       "16319              67          1    8     Term             40.0   \n",
       "\n",
       "       mabc_percentile  \n",
       "0                 50.0  \n",
       "1                 50.0  \n",
       "2                 50.0  \n",
       "3                 50.0  \n",
       "4                 50.0  \n",
       "...                ...  \n",
       "16315              0.5  \n",
       "16316              0.5  \n",
       "16317              0.5  \n",
       "16318              0.5  \n",
       "16319              0.5  \n",
       "\n",
       "[16320 rows x 6 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_char = df[['participant_id', 'task_type', 'Age', 'Cohort', 'Gestational_Age', 'mabc_percentile']]\n",
    "df_char"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc778fee-8b7b-4f00-ae5b-393ce3fd18d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "N_PER_COHORT = 18  # participants drawn per cohort; a smaller cohort is used in full\n",
    "SAMPLE_SEED = 42   # fixed seed so the same participants are drawn on every run\n",
    "\n",
    "\n",
    "def get_ci(data, confidence=0.95):\n",
    "    \"\"\"Return (mean, lower bound, upper bound) of a t-based confidence interval.\n",
    "\n",
    "    data: 1-D sequence of observations.\n",
    "    confidence: two-sided coverage of the interval (0.95 gives a 95% CI).\n",
    "    \"\"\"\n",
    "    n = len(data)\n",
    "    m = np.mean(data)\n",
    "    se = stats.sem(data)\n",
    "    h = se * stats.t.ppf((1 + confidence) / 2, n - 1)\n",
    "    return m, m - h, m + h\n",
    "\n",
    "\n",
    "term_participant_ids = df[df['Cohort'] == 'Term']['participant_id'].unique()\n",
    "preterm_participant_ids = df[df['Cohort'] == 'Preterm']['participant_id'].unique()\n",
    "\n",
    "# Draw up to N_PER_COHORT participants from each cohort. Term is sampled before\n",
    "# Preterm so the seeded draw order (and therefore the selection) stays stable.\n",
    "random.seed(SAMPLE_SEED)\n",
    "selected_term_ids = random.sample(list(term_participant_ids), min(N_PER_COHORT, len(term_participant_ids)))\n",
    "selected_preterm_ids = random.sample(list(preterm_participant_ids), min(N_PER_COHORT, len(preterm_participant_ids)))\n",
    "\n",
    "# Trial rows belonging to the selected participants\n",
    "term_subset = df[df['participant_id'].isin(selected_term_ids)]\n",
    "preterm_subset = df[df['participant_id'].isin(selected_preterm_ids)]\n",
    "print(term_subset.shape, preterm_subset.shape)\n",
    "\n",
    "# Compare at participant level: each participant contributes exactly one value.\n",
    "# Passing every trial row to the test would count the same score once per trial,\n",
    "# overstating the sample size and understating the uncertainty.\n",
    "# NOTE(review): first() assumes mabc_percentile is constant within a participant - confirm.\n",
    "term_data = term_subset.groupby('participant_id')['mabc_percentile'].first().values\n",
    "preterm_data = preterm_subset.groupby('participant_id')['mabc_percentile'].first().values\n",
    "\n",
    "# Group means with 95% confidence intervals\n",
    "term_mean, term_lower, term_upper = get_ci(term_data)\n",
    "preterm_mean, preterm_lower, preterm_upper = get_ci(preterm_data)\n",
    "\n",
    "# Independent-samples test; equal_var=False selects Welch's t-test (unequal variances)\n",
    "t_stat, p_value = stats.ttest_ind(term_data, preterm_data, equal_var=False)\n",
    "\n",
    "# Print results (n is now the number of participants in each group)\n",
    "print(\"T-test results comparing MABC percentile between randomly selected Term and Preterm cohorts:\")\n",
    "print(f\"Term (n={len(term_data)}): {term_mean:.2f} [95% CI: {term_lower:.2f} to {term_upper:.2f}]\")\n",
    "print(f\"Preterm (n={len(preterm_data)}): {preterm_mean:.2f} [95% CI: {preterm_lower:.2f} to {preterm_upper:.2f}]\")\n",
    "print(f\"Mean difference: {term_mean - preterm_mean:.2f}\")\n",
    "print(f\"t-statistic: {t_stat:.3f}\")\n",
    "print(f\"p-value: {p_value:.4f}\")\n",
    "\n",
    "# Check which specific participant_ids were selected\n",
    "print(\"\\nSelected Term participant IDs:\", selected_term_ids)\n",
    "print(\"Selected Preterm participant IDs:\", selected_preterm_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "4e2e3973-e1dd-41c2-aba1-9f56f57aecd7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:360: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:375: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:100: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  cohort_df['age_group'] = pd.cut(cohort_df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:107: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  cohort_df['task_type_label'] = cohort_df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:100: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  cohort_df['age_group'] = pd.cut(cohort_df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:107: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  cohort_df['task_type_label'] = cohort_df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:151: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_unique['age_group'] = pd.cut(df_unique['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:170: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_unique['task_type_label'] = df_unique['task_type'].map(task_map)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                        Characteristic          Value\n",
      "0               Number of participants          16320\n",
      "1               Age (years), mean ± SD    6.69 ± 1.06\n",
      "2                            Age range      5.0 - 8.0\n",
      "3                       Age group: 5-6   3120 (19.1%)\n",
      "4                       Age group: 6-7   3120 (19.1%)\n",
      "5                       Age group: 7-8   5760 (35.3%)\n",
      "6                         Cohort: Term  12000 (73.5%)\n",
      "7                      Cohort: Preterm   4320 (26.5%)\n",
      "8                 Task type: Unimanual   8160 (50.0%)\n",
      "9                  Task type: Bimanual   8160 (50.0%)\n",
      "10  Gestational Age (weeks), mean ± SD   37.06 ± 3.93\n",
      "11               Gestational Age range    23.0 - 42.0\n",
      "12          MABC percentile, mean ± SD  39.10 ± 30.63\n",
      "13               MABC percentile range     0.5 - 95.0\n",
      "{'overall': {'n_participants': 68, 'age': {'mean': 6.6911764705882355, 'std': 1.068648929984729, 'median': 7.0, 'min': 5, 'max': 8}, 'age_group': {'7-8': 24, '5-6': 13, '6-7': 13}, 'age_group_percent': {'7-8': 48.0, '5-6': 26.0, '6-7': 26.0}, 'cohort': {'Term': 50, 'Preterm': 18}, 'cohort_percent': {'Term': 73.52941176470588, 'Preterm': 26.47058823529412}, 'task_type': {'Unimanual': 68}, 'task_type_percent': {'Unimanual': 100.0}, 'gestational_age': {'mean': 37.05882352941177, 'std': 3.954526418204072, 'median': 39.0, 'min': 23.0, 'max': 42.0}, 'mabc_percentile': {'mean': 39.095588235294116, 'std': 30.856100457927447, 'median': 37.0, 'min': 0.5, 'max': 95.0}}, 'by_cohort': {'Preterm': {'n_participants': 18, 'age': {'mean': 6.333333333333333, 'std': 1.0289915108550531, 'median': 7.0, 'min': 5, 'max': 8}, 'age_group': {'7-8': 10, '5-6': 6, '6-7': 1}, 'age_group_percent': {'7-8': 58.82352941176471, '5-6': 35.294117647058826, '6-7': 5.88235294117647}, 'cohort': {'Preterm': 18}, 'cohort_percent': {'Preterm': 100.0}, 'task_type': {'Unimanual': 18}, 'task_type_percent': {'Unimanual': 100.0}, 'gestational_age': {'mean': 31.38888888888889, 'std': 3.269986308460074, 'median': 32.5, 'min': 23.0, 'max': 34.0}, 'mabc_percentile': {'mean': 23.61111111111111, 'std': 30.19571237364063, 'median': 7.0, 'min': 1.0, 'max': 84.0}}, 'Term': {'n_participants': 50, 'age': {'mean': 6.82, 'std': 1.063110568865089, 'median': 7.0, 'min': 5, 'max': 8}, 'age_group': {'7-8': 14, '6-7': 12, '5-6': 7}, 'age_group_percent': {'7-8': 42.42424242424242, '6-7': 36.36363636363637, '5-6': 21.21212121212121}, 'cohort': {'Term': 50}, 'cohort_percent': {'Term': 100.0}, 'task_type': {'Unimanual': 50}, 'task_type_percent': {'Unimanual': 100.0}, 'gestational_age': {'mean': 39.1, 'std': 1.2697420596165128, 'median': 39.0, 'min': 37.0, 'max': 42.0}, 'mabc_percentile': {'mean': 44.67, 'std': 29.42330223285133, 'median': 37.0, 'min': 0.5, 'max': 95.0}}}}\n",
      "                       Characteristic        Overall        Preterm  \\\n",
      "0              Number of participants             68             18   \n",
      "1              Age (years), mean ± SD    6.69 ± 1.07    6.33 ± 1.03   \n",
      "2                           Age range      5.0 - 8.0      5.0 - 8.0   \n",
      "3                      Age group: 5-6     13 (19.1%)      6 (33.3%)   \n",
      "4                      Age group: 6-7     13 (19.1%)       1 (5.6%)   \n",
      "5                      Age group: 7-8     24 (35.3%)     10 (55.6%)   \n",
      "6                Task type: Unimanual    68 (100.0%)    18 (100.0%)   \n",
      "7                 Task type: Bimanual       0 (0.0%)       0 (0.0%)   \n",
      "8  Gestational Age (weeks), mean ± SD   37.06 ± 3.95   31.39 ± 3.27   \n",
      "9          MABC percentile, mean ± SD  39.10 ± 30.86  23.61 ± 30.20   \n",
      "\n",
      "            Term  \n",
      "0             50  \n",
      "1    6.82 ± 1.06  \n",
      "2      5.0 - 8.0  \n",
      "3      7 (14.0%)  \n",
      "4     12 (24.0%)  \n",
      "5     14 (28.0%)  \n",
      "6    50 (100.0%)  \n",
      "7       0 (0.0%)  \n",
      "8   39.10 ± 1.27  \n",
      "9  44.67 ± 29.42  \n",
      "Participant Characteristics:\n",
      "A total of 68 unique participants were included in the analysis. Participants were distributed across cohorts as follows: Preterm: 18 (26.5%), Term: 50 (73.5%). \n",
      "\n",
      "Age characteristics by cohort:\n",
      "- Preterm: mean age 6.33 years (SD = 1.03, range = 5.0-8.0). Age groups: 5-6 years: 6 (35.3%), 6-7 years: 1 (5.9%), 7-8 years: 10 (58.8%).\n",
      "- Term: mean age 6.82 years (SD = 1.06, range = 5.0-8.0). Age groups: 5-6 years: 7 (21.2%), 6-7 years: 12 (36.4%), 7-8 years: 14 (42.4%).\n",
      "\n",
      "Task type distribution by cohort:\n",
      "- Preterm: Unimanual: 18 (100.0%).\n",
      "- Term: Unimanual: 50 (100.0%).\n",
      "\n",
      "Gestational age characteristics by cohort:\n",
      "- Preterm: mean gestational age 31.39 weeks (SD = 3.27, range = 23.0-34.0).\n",
      "- Term: mean gestational age 39.10 weeks (SD = 1.27, range = 37.0-42.0).\n",
      "\n",
      "MABC percentile characteristics by cohort:\n",
      "- Preterm: mean MABC percentile 23.61 (SD = 30.20, range = 1.0-84.0).\n",
      "- Term: mean MABC percentile 44.67 (SD = 29.42, range = 0.5-95.0).\n",
      "\n",
      "Total rows: 120\n",
      "Unique participants: 100\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:100: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  cohort_df['age_group'] = pd.cut(cohort_df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:107: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  cohort_df['task_type_label'] = cohort_df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:100: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  cohort_df['age_group'] = pd.cut(cohort_df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:107: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  cohort_df['task_type_label'] = cohort_df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:151: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_unique['age_group'] = pd.cut(df_unique['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:170: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_unique['task_type_label'] = df_unique['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:302: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['age_group'] = pd.cut(df['Age'],\n",
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/2907653457.py:315: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['task_type_label'] = df['task_type'].map(task_map)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Comparison Table:\n",
      "                       Characteristic        Overall        Control  \\\n",
      "0              Number of participants            100             51   \n",
      "1              Age (years), mean ± SD    6.41 ± 0.89    6.43 ± 0.87   \n",
      "2                           Age range      5.0 - 8.0      5.1 - 8.0   \n",
      "3                      Age group: 5-6     41 (41.0%)     20 (39.2%)   \n",
      "4                      Age group: 6-7     28 (28.0%)     14 (27.5%)   \n",
      "5                      Age group: 7-8     31 (31.0%)     17 (33.3%)   \n",
      "6                Task type: Unimanual     60 (60.0%)     31 (60.8%)   \n",
      "7                 Task type: Bimanual     40 (40.0%)     20 (39.2%)   \n",
      "8  Gestational Age (weeks), mean ± SD   39.23 ± 1.97   38.95 ± 1.81   \n",
      "9          MABC percentile, mean ± SD  44.53 ± 28.35  39.02 ± 30.22   \n",
      "\n",
      "    Experimental  \n",
      "0             49  \n",
      "1    6.39 ± 0.92  \n",
      "2      5.0 - 7.9  \n",
      "3     21 (42.9%)  \n",
      "4     14 (28.6%)  \n",
      "5     14 (28.6%)  \n",
      "6     29 (59.2%)  \n",
      "7     20 (40.8%)  \n",
      "8   39.53 ± 2.10  \n",
      "9  50.25 ± 25.31  \n",
      "\n",
      "Formatted for Research Paper:\n",
      "Participant Characteristics:\n",
      "A total of 100 unique participants were included in the analysis. Participants were distributed across cohorts as follows: Control: 51 (51.0%), Experimental: 49 (49.0%). \n",
      "\n",
      "Age characteristics by cohort:\n",
      "- Control: mean age 6.43 years (SD = 0.87, range = 5.1-8.0). Age groups: 5-6 years: 20 (39.2%), 6-7 years: 14 (27.5%), 7-8 years: 17 (33.3%).\n",
      "- Experimental: mean age 6.39 years (SD = 0.92, range = 5.0-7.9). Age groups: 5-6 years: 21 (42.9%), 6-7 years: 14 (28.6%), 7-8 years: 14 (28.6%).\n",
      "\n",
      "Task type distribution by cohort:\n",
      "- Control: Unimanual: 31 (60.8%), Bimanual: 20 (39.2%).\n",
      "- Experimental: Unimanual: 29 (59.2%), Bimanual: 20 (40.8%).\n",
      "\n",
      "Gestational age characteristics by cohort:\n",
      "- Control: mean gestational age 38.95 weeks (SD = 1.81, range = 35.8-43.4).\n",
      "- Experimental: mean gestational age 39.53 weeks (SD = 2.10, range = 35.9-46.7).\n",
      "\n",
      "MABC percentile characteristics by cohort:\n",
      "- Control: mean MABC percentile 39.02 (SD = 30.22, range = 2.8-95.2).\n",
      "- Experimental: mean MABC percentile 50.25 (SD = 25.31, range = 2.4-96.6).\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def format_cohort_comparison_for_paper(df):\n",
    "    \"\"\"\n",
    "    Render a prose summary of participant characteristics, cohort by cohort.\n",
    "\n",
    "    Rows are deduplicated on 'participant_id' to list and count the cohorts; the\n",
    "    per-cohort statistics come from calculate_participant_characteristics_by_cohort.\n",
    "\n",
    "    Parameters:\n",
    "    df (pandas.DataFrame): DataFrame containing participant data\n",
    "\n",
    "    Returns:\n",
    "    str: Formatted text for research paper comparing cohorts\n",
    "    \"\"\"\n",
    "    participants = df.drop_duplicates(subset=['participant_id'])\n",
    "    cohort_names = participants['Cohort'].unique()\n",
    "    summary = calculate_participant_characteristics_by_cohort(df)\n",
    "    per_cohort = summary['by_cohort']\n",
    "    total = len(participants)\n",
    "\n",
    "    def _spread(measure):\n",
    "        # SD-and-range fragment shared by every numeric measure\n",
    "        return (f\"(SD = {measure['std']:.2f}, \"\n",
    "                f\"range = {measure['min']:.1f}-{measure['max']:.1f})\")\n",
    "\n",
    "    # Opening sentence followed by the split of participants across cohorts\n",
    "    pieces = [\n",
    "        \"Participant Characteristics:\\n\",\n",
    "        f\"A total of {summary['overall']['n_participants']} unique participants were included in the analysis. \",\n",
    "        \"Participants were distributed across cohorts as follows: \",\n",
    "    ]\n",
    "    shares = []\n",
    "    for name in cohort_names:\n",
    "        size = len(participants[participants['Cohort'] == name])\n",
    "        shares.append(f\"{name}: {size} ({size / total * 100:.1f}%)\")\n",
    "    pieces.append(\", \".join(shares) + \". \")\n",
    "\n",
    "    # Age: mean / SD / range plus the age-group breakdown, one bullet per cohort\n",
    "    pieces.append(\"\\n\\nAge characteristics by cohort:\\n\")\n",
    "    for name in cohort_names:\n",
    "        cohort_summary = per_cohort[name]\n",
    "        age = cohort_summary['age']\n",
    "        group_share = cohort_summary['age_group_percent']\n",
    "        groups = ', '.join(\n",
    "            f\"{label} years: {count} ({group_share[label]:.1f}%)\"\n",
    "            for label, count in sorted(cohort_summary['age_group'].items())\n",
    "        )\n",
    "        pieces.append(f\"- {name}: mean age {age['mean']:.2f} years {_spread(age)}. \"\n",
    "                      f\"Age groups: {groups}.\\n\")\n",
    "\n",
    "    # Task types, one bullet per cohort\n",
    "    pieces.append(\"\\nTask type distribution by cohort:\\n\")\n",
    "    for name in cohort_names:\n",
    "        cohort_summary = per_cohort[name]\n",
    "        task_share = cohort_summary['task_type_percent']\n",
    "        tasks = ', '.join(\n",
    "            f\"{task}: {count} ({task_share[task]:.1f}%)\"\n",
    "            for task, count in cohort_summary['task_type'].items()\n",
    "        )\n",
    "        pieces.append(f\"- {name}: {tasks}.\\n\")\n",
    "\n",
    "    # Gestational age, one bullet per cohort\n",
    "    pieces.append(\"\\nGestational age characteristics by cohort:\\n\")\n",
    "    for name in cohort_names:\n",
    "        gestation = per_cohort[name]['gestational_age']\n",
    "        pieces.append(f\"- {name}: mean gestational age {gestation['mean']:.2f} weeks \"\n",
    "                      f\"{_spread(gestation)}.\\n\")\n",
    "\n",
    "    # MABC percentile, one bullet per cohort\n",
    "    pieces.append(\"\\nMABC percentile characteristics by cohort:\\n\")\n",
    "    for name in cohort_names:\n",
    "        mabc = per_cohort[name]['mabc_percentile']\n",
    "        pieces.append(f\"- {name}: mean MABC percentile {mabc['mean']:.2f} \"\n",
    "                      f\"{_spread(mabc)}.\\n\")\n",
    "\n",
    "    return \"\".join(pieces)\n",
    "\n",
    "def create_cohort_comparison_table(df):\n",
    "    \"\"\"\n",
    "    Create a summary table comparing participant characteristics across cohorts.\n",
    "    Only uses unique participants (drops duplicates).\n",
    "    \n",
    "    Parameters:\n",
    "    df (pandas.DataFrame): DataFrame containing participant data\n",
    "    \n",
    "    Returns:\n",
    "    pandas.DataFrame: Summary table with statistics by cohort\n",
    "    \"\"\"\n",
    "    # Ensure we only use unique participants\n",
    "    df_unique = df.drop_duplicates(subset=['participant_id'])\n",
    "    \n",
    "    # Get unique cohorts\n",
    "    cohorts = df_unique['Cohort'].unique()\n",
    "    \n",
    "    # Calculate statistics for each cohort\n",
    "    cohort_stats = {}\n",
    "    for cohort in cohorts:\n",
    "        cohort_df = df_unique[df_unique['Cohort'] == cohort]\n",
    "        \n",
    "        # Create age groups for this cohort\n",
    "        cohort_df['age_group'] = pd.cut(cohort_df['Age'], \n",
    "                                       bins=[5, 6, 7, 8], \n",
    "                                       labels=['5-6', '6-7', '7-8'], \n",
    "                                       right=False)\n",
    "        \n",
    "        # Map task type\n",
    "        task_map = {0: 'Unimanual', 1: 'Bimanual'}\n",
    "        cohort_df['task_type_label'] = cohort_df['task_type'].map(task_map)\n",
    "        \n",
    "        # Store stats for this cohort\n",
    "        cohort_stats[cohort] = {\n",
    "            'n': len(cohort_df),\n",
    "            'age_mean': cohort_df['Age'].mean(),\n",
    "            'age_std': cohort_df['Age'].std(),\n",
    "            'age_min': cohort_df['Age'].min(),\n",
    "            'age_max': cohort_df['Age'].max(),\n",
    "            'age_groups': cohort_df['age_group'].value_counts().to_dict(),\n",
    "            'task_types': cohort_df['task_type_label'].value_counts().to_dict(),\n",
    "            'gestational_age_mean': cohort_df['Gestational_Age'].mean(),\n",
    "            'gestational_age_std': cohort_df['Gestational_Age'].std(),\n",
    "            'mabc_mean': cohort_df['mabc_percentile'].mean(),\n",
    "            'mabc_std': cohort_df['mabc_percentile'].std()\n",
    "        }\n",
    "    \n",
    "    # Create comparison table\n",
    "    # First, prepare the columns: Characteristic, Overall, Cohort1, Cohort2, ...\n",
    "    columns = ['Characteristic', 'Overall'] + list(cohorts)\n",
    "    comparison = pd.DataFrame(columns=columns)\n",
    "    \n",
    "    # Add rows for each characteristic\n",
    "    # Number of participants\n",
    "    row = ['Number of participants', len(df_unique)]\n",
    "    for cohort in cohorts:\n",
    "        row.append(cohort_stats[cohort]['n'])\n",
    "    comparison.loc[len(comparison)] = row\n",
    "    \n",
    "    # Age\n",
    "    row = ['Age (years), mean ± SD', \n",
    "           f\"{df_unique['Age'].mean():.2f} ± {df_unique['Age'].std():.2f}\"]\n",
    "    for cohort in cohorts:\n",
    "        row.append(f\"{cohort_stats[cohort]['age_mean']:.2f} ± {cohort_stats[cohort]['age_std']:.2f}\")\n",
    "    comparison.loc[len(comparison)] = row\n",
    "    \n",
    "    # Age range\n",
    "    row = ['Age range', \n",
    "           f\"{df_unique['Age'].min():.1f} - {df_unique['Age'].max():.1f}\"]\n",
    "    for cohort in cohorts:\n",
    "        row.append(f\"{cohort_stats[cohort]['age_min']:.1f} - {cohort_stats[cohort]['age_max']:.1f}\")\n",
    "    comparison.loc[len(comparison)] = row\n",
    "    \n",
    "    # Age groups\n",
    "    df_unique['age_group'] = pd.cut(df_unique['Age'], \n",
    "                                  bins=[5, 6, 7, 8], \n",
    "                                  labels=['5-6', '6-7', '7-8'], \n",
    "                                  right=False)\n",
    "    for age_group in ['5-6', '6-7', '7-8']:\n",
    "        overall_count = df_unique['age_group'].value_counts().get(age_group, 0)\n",
    "        overall_percent = overall_count / len(df_unique) * 100 if len(df_unique) > 0 else 0\n",
    "        \n",
    "        row = [f'Age group: {age_group}', f\"{overall_count} ({overall_percent:.1f}%)\"]\n",
    "        \n",
    "        for cohort in cohorts:\n",
    "            cohort_count = cohort_stats[cohort]['age_groups'].get(age_group, 0)\n",
    "            cohort_percent = cohort_count / cohort_stats[cohort]['n'] * 100 if cohort_stats[cohort]['n'] > 0 else 0\n",
    "            row.append(f\"{cohort_count} ({cohort_percent:.1f}%)\")\n",
    "        \n",
    "        comparison.loc[len(comparison)] = row\n",
    "    \n",
    "    # Task types\n",
    "    task_map = {0: 'Unimanual', 1: 'Bimanual'}\n",
    "    df_unique['task_type_label'] = df_unique['task_type'].map(task_map)\n",
    "    for task in ['Unimanual', 'Bimanual']:\n",
    "        overall_count = df_unique['task_type_label'].value_counts().get(task, 0)\n",
    "        overall_percent = overall_count / len(df_unique) * 100 if len(df_unique) > 0 else 0\n",
    "        \n",
    "        row = [f'Task type: {task}', f\"{overall_count} ({overall_percent:.1f}%)\"]\n",
    "        \n",
    "        for cohort in cohorts:\n",
    "            cohort_count = cohort_stats[cohort]['task_types'].get(task, 0)\n",
    "            cohort_percent = cohort_count / cohort_stats[cohort]['n'] * 100 if cohort_stats[cohort]['n'] > 0 else 0\n",
    "            row.append(f\"{cohort_count} ({cohort_percent:.1f}%)\")\n",
    "        \n",
    "        comparison.loc[len(comparison)] = row\n",
    "    \n",
    "    # Gestational Age\n",
    "    row = ['Gestational Age (weeks), mean ± SD', \n",
    "           f\"{df_unique['Gestational_Age'].mean():.2f} ± {df_unique['Gestational_Age'].std():.2f}\"]\n",
    "    for cohort in cohorts:\n",
    "        row.append(f\"{cohort_stats[cohort]['gestational_age_mean']:.2f} ± {cohort_stats[cohort]['gestational_age_std']:.2f}\")\n",
    "    comparison.loc[len(comparison)] = row\n",
    "    \n",
    "    # MABC percentile\n",
    "    row = ['MABC percentile, mean ± SD', \n",
    "           f\"{df_unique['mabc_percentile'].mean():.2f} ± {df_unique['mabc_percentile'].std():.2f}\"]\n",
    "    for cohort in cohorts:\n",
    "        row.append(f\"{cohort_stats[cohort]['mabc_mean']:.2f} ± {cohort_stats[cohort]['mabc_std']:.2f}\")\n",
    "    comparison.loc[len(comparison)] = row\n",
    "    \n",
    "    return comparison\n",
    "\n",
    "def calculate_participant_characteristics_by_cohort(df):\n",
    "    \"\"\"\n",
    "    Compute participant statistics for the whole sample and for each cohort.\n",
    "    Duplicate 'participant_id' rows are dropped first (the first occurrence is kept).\n",
    "\n",
    "    Parameters:\n",
    "    df (pandas.DataFrame): DataFrame containing participant data with columns:\n",
    "        'participant_id', 'Age', 'Cohort', 'Gestational_Age', 'mabc_percentile', 'task_type'\n",
    "\n",
    "    Returns:\n",
    "    dict: {'overall': <statistics>, 'by_cohort': {cohort: <statistics>}}, where each\n",
    "        <statistics> is the dict returned by calculate_participant_characteristics\n",
    "    \"\"\"\n",
    "    participants = df.drop_duplicates(subset=['participant_id'])\n",
    "    cohort_names = participants['Cohort'].unique()\n",
    "\n",
    "    # Whole-sample statistics first, then one entry per cohort\n",
    "    overall = calculate_participant_characteristics(participants)\n",
    "    by_cohort = {\n",
    "        name: calculate_participant_characteristics(participants[participants['Cohort'] == name])\n",
    "        for name in cohort_names\n",
    "    }\n",
    "\n",
    "    return {'overall': overall, 'by_cohort': by_cohort}\n",
    "\n",
    "\n",
    "# Assuming df_char is your dataframe with the participant characteristics\n",
    "# If you don't have this dataframe already, you would load it from your data source\n",
    "# For example: df_char = pd.read_csv('your_data.csv')\n",
    "\n",
    "# Example of creating a sample dataframe for testing purposes\n",
    "def create_sample_dataframe(n=100):\n",
    "    \"\"\"\n",
    "    Create a sample dataframe with participant characteristics for testing.\n",
    "    \n",
    "    Parameters:\n",
    "    n (int): Number of participants to generate\n",
    "    \n",
    "    Returns:\n",
    "    pandas.DataFrame: Sample dataframe with participant characteristics\n",
    "    \"\"\"\n",
    "    np.random.seed(42)  # For reproducibility\n",
    "    \n",
    "    # Generate random data\n",
    "    participant_ids = [f\"P{i:03d}\" for i in range(1, n+1)]\n",
    "    ages = np.random.uniform(5.0, 8.0, n)  # Ages between 5 and 8 years\n",
    "    cohorts = np.random.choice(['Control', 'Experimental'], n)\n",
    "    gestational_ages = np.random.normal(39, 2, n)  # Normal distribution around 39 weeks\n",
    "    mabc_percentiles = np.random.uniform(1, 99, n)  # Percentiles from 1 to 99\n",
    "    task_types = np.random.choice([0, 1], n)  # 0=Unimanual, 1=Bimanual\n",
    "    \n",
    "    # Create dataframe\n",
    "    df = pd.DataFrame({\n",
    "        'participant_id': participant_ids,\n",
    "        'Age': ages,\n",
    "        'Cohort': cohorts,\n",
    "        'Gestational_Age': gestational_ages,\n",
    "        'mabc_percentile': mabc_percentiles,\n",
    "        'task_type': task_types\n",
    "    })\n",
    "    \n",
    "    return df\n",
    "\n",
    "# Uncomment to create a sample dataframe for testing\n",
    "# df_char = create_sample_dataframe(100)\n",
    "\n",
    "def calculate_participant_characteristics(df):\n",
    "    \"\"\"\n",
    "    Calculate descriptive statistics for participant characteristics.\n",
    "    \n",
    "    Parameters:\n",
    "    df (pandas.DataFrame): DataFrame containing participant data with columns:\n",
    "        'participant_id', 'Age', 'Cohort', 'Gestational_Age', 'mabc_percentile', 'task_type'\n",
    "    \n",
    "    Returns:\n",
    "    dict: Dictionary containing statistics for each measure\n",
    "    \"\"\"\n",
    "    stats = {}\n",
    "    \n",
    "    # Number of participants\n",
    "    stats['n_participants'] = len(df)\n",
    "    \n",
    "    # Age statistics\n",
    "    stats['age'] = {\n",
    "        'mean': df['Age'].mean(),\n",
    "        'std': df['Age'].std(),\n",
    "        'median': df['Age'].median(),\n",
    "        'min': df['Age'].min(),\n",
    "        'max': df['Age'].max()\n",
    "    }\n",
    "    \n",
    "    # Age group distribution\n",
    "    # Create age groups: 5-6, 6-7, 7-8\n",
    "    df['age_group'] = pd.cut(df['Age'], \n",
    "                            bins=[5, 6, 7, 8], \n",
    "                            labels=['5-6', '6-7', '7-8'], \n",
    "                            right=False)\n",
    "    stats['age_group'] = df['age_group'].value_counts().to_dict()\n",
    "    stats['age_group_percent'] = (df['age_group'].value_counts(normalize=True) * 100).to_dict()\n",
    "    \n",
    "    # Cohort distribution\n",
    "    stats['cohort'] = df['Cohort'].value_counts().to_dict()\n",
    "    stats['cohort_percent'] = (df['Cohort'].value_counts(normalize=True) * 100).to_dict()\n",
    "    \n",
    "    # Task type distribution (0=Unimanual, 1=Bimanual)\n",
    "    task_map = {0: 'Unimanual', 1: 'Bimanual'}\n",
    "    df['task_type_label'] = df['task_type'].map(task_map)\n",
    "    stats['task_type'] = df['task_type_label'].value_counts().to_dict()\n",
    "    stats['task_type_percent'] = (df['task_type_label'].value_counts(normalize=True) * 100).to_dict()\n",
    "    \n",
    "    # Gestational Age statistics\n",
    "    stats['gestational_age'] = {\n",
    "        'mean': df['Gestational_Age'].mean(),\n",
    "        'std': df['Gestational_Age'].std(),\n",
    "        'median': df['Gestational_Age'].median(),\n",
    "        'min': df['Gestational_Age'].min(),\n",
    "        'max': df['Gestational_Age'].max()\n",
    "    }\n",
    "    \n",
    "    # MABC percentile statistics\n",
    "    stats['mabc_percentile'] = {\n",
    "        'mean': df['mabc_percentile'].mean(),\n",
    "        'std': df['mabc_percentile'].std(),\n",
    "        'median': df['mabc_percentile'].median(),\n",
    "        'min': df['mabc_percentile'].min(),\n",
    "        'max': df['mabc_percentile'].max()\n",
    "    }\n",
    "    \n",
    "    return stats\n",
    "\n",
    "def create_summary_table(df):\n",
    "    \"\"\"\n",
    "    Create a summary table for participant characteristics.\n",
    "    \n",
    "    Parameters:\n",
    "    df (pandas.DataFrame): DataFrame containing participant data\n",
    "    \n",
    "    Returns:\n",
    "    pandas.DataFrame: Summary table with statistics\n",
    "    \"\"\"\n",
    "    # Create a summary dataframe\n",
    "    summary = pd.DataFrame(columns=['Characteristic', 'Value'])\n",
    "    \n",
    "    # Add number of participants\n",
    "    summary.loc[len(summary)] = ['Number of participants', len(df)]\n",
    "    \n",
    "    # Add age statistics\n",
    "    summary.loc[len(summary)] = ['Age (years), mean ± SD', f\"{df['Age'].mean():.2f} ± {df['Age'].std():.2f}\"]\n",
    "    summary.loc[len(summary)] = ['Age range', f\"{df['Age'].min():.1f} - {df['Age'].max():.1f}\"]\n",
    "    \n",
    "    # Add age group distribution\n",
    "    df['age_group'] = pd.cut(df['Age'], \n",
    "                           bins=[5, 6, 7, 8], \n",
    "                           labels=['5-6', '6-7', '7-8'], \n",
    "                           right=False)\n",
    "    for age_group, count in df['age_group'].value_counts().sort_index().items():\n",
    "        percent = count / len(df) * 100\n",
    "        summary.loc[len(summary)] = [f'Age group: {age_group}', f\"{count} ({percent:.1f}%)\"]\n",
    "    \n",
    "    # Add cohort distribution\n",
    "    for cohort, count in df['Cohort'].value_counts().items():\n",
    "        percent = count / len(df) * 100\n",
    "        summary.loc[len(summary)] = [f'Cohort: {cohort}', f\"{count} ({percent:.1f}%)\"]\n",
    "    \n",
    "    # Add task type distribution\n",
    "    task_map = {0: 'Unimanual', 1: 'Bimanual'}\n",
    "    df['task_type_label'] = df['task_type'].map(task_map)\n",
    "    for task, count in df['task_type_label'].value_counts().items():\n",
    "        percent = count / len(df) * 100\n",
    "        summary.loc[len(summary)] = [f'Task type: {task}', f\"{count} ({percent:.1f}%)\"]\n",
    "    \n",
    "    # Add gestational age statistics\n",
    "    summary.loc[len(summary)] = ['Gestational Age (weeks), mean ± SD', \n",
    "                                f\"{df['Gestational_Age'].mean():.2f} ± {df['Gestational_Age'].std():.2f}\"]\n",
    "    summary.loc[len(summary)] = ['Gestational Age range', \n",
    "                               f\"{df['Gestational_Age'].min():.1f} - {df['Gestational_Age'].max():.1f}\"]\n",
    "    \n",
    "    # Add MABC percentile statistics\n",
    "    summary.loc[len(summary)] = ['MABC percentile, mean ± SD', \n",
    "                               f\"{df['mabc_percentile'].mean():.2f} ± {df['mabc_percentile'].std():.2f}\"]\n",
    "    summary.loc[len(summary)] = ['MABC percentile range', \n",
    "                              f\"{df['mabc_percentile'].min():.1f} - {df['mabc_percentile'].max():.1f}\"]\n",
    "    \n",
    "    return summary\n",
    "\n",
    "# To use the functions:\n",
    "# The statistics dict is bound to participant_stats rather than stats, so the\n",
    "# notebook-level alias created by `import scipy.stats as stats` is not overwritten.\n",
    "participant_stats = calculate_participant_characteristics(df_char)\n",
    "summary_table = create_summary_table(df_char)\n",
    "print(summary_table)\n",
    "\n",
    "# Example of how to format the statistics for a research paper\n",
    "def format_for_research_paper(df):\n",
    "    \"\"\"\n",
    "    Write the participant characteristics as a single paragraph for a research paper.\n",
    "\n",
    "    Parameters:\n",
    "    df (pandas.DataFrame): DataFrame containing participant data\n",
    "\n",
    "    Returns:\n",
    "    str: Formatted text for research paper\n",
    "    \"\"\"\n",
    "    summary = calculate_participant_characteristics(df)\n",
    "\n",
    "    def _spread(measure):\n",
    "        # SD-and-range fragment shared by every numeric measure\n",
    "        return (f\"(SD = {measure['std']:.2f}, \"\n",
    "                f\"range = {measure['min']:.1f}-{measure['max']:.1f})\")\n",
    "\n",
    "    def _shares(pairs, percents, unit=''):\n",
    "        # Comma-separated count-and-percentage entries, one per category\n",
    "        return ', '.join(f\"{key}{unit}: {count} ({percents[key]:.1f}%)\"\n",
    "                         for key, count in pairs)\n",
    "\n",
    "    age = summary['age']\n",
    "    gestation = summary['gestational_age']\n",
    "    mabc = summary['mabc_percentile']\n",
    "\n",
    "    # Age groups are listed in sorted label order; cohorts and tasks keep dict order\n",
    "    age_groups = _shares(sorted(summary['age_group'].items()),\n",
    "                         summary['age_group_percent'], unit=' years')\n",
    "    cohorts = _shares(summary['cohort'].items(), summary['cohort_percent'])\n",
    "    tasks = _shares(summary['task_type'].items(), summary['task_type_percent'])\n",
    "\n",
    "    sentences = [\n",
    "        \"Participant Characteristics:\\n\",\n",
    "        f\"A total of {summary['n_participants']} participants were included in the analysis. \",\n",
    "        f\"The mean age was {age['mean']:.2f} years {_spread(age)}. \",\n",
    "        f\"Participants were distributed across age groups as follows: {age_groups}. \",\n",
    "        f\"Participants were distributed across cohorts as follows: {cohorts}. \",\n",
    "        f\"Tasks were distributed as follows: {tasks}. \",\n",
    "        f\"The mean gestational age was {gestation['mean']:.2f} weeks {_spread(gestation)}. \",\n",
    "        f\"The mean MABC percentile was {mabc['mean']:.2f} {_spread(mabc)}.\",\n",
    "    ]\n",
    "\n",
    "    return \"\".join(sentences)\n",
    "\n",
    "# Example usage with the cohort comparison functions on df_char.\n",
    "# NOTE(review): df_char is not defined in this cell; it must already exist in the\n",
    "# kernel when the cell runs - confirm which cell creates it.\n",
    "# By cohort statistics (nested dict: 'overall' plus one entry per cohort)\n",
    "stats_by_cohort = calculate_participant_characteristics_by_cohort(df_char)\n",
    "print(stats_by_cohort)\n",
    "\n",
    "# Cohort comparison table (one row per characteristic, one column per cohort)\n",
    "comparison_table = create_cohort_comparison_table(df_char)\n",
    "print(comparison_table)\n",
    "\n",
    "# Cohort comparison for research paper (plain-text paragraph)\n",
    "cohort_paper_text = format_cohort_comparison_for_paper(df_char)\n",
    "print(cohort_paper_text)\n",
    "\n",
    "# Example of how to use the functions with sample data.\n",
    "# The statements below run at cell level and rebind stats_by_cohort,\n",
    "# comparison_table and cohort_paper_text computed from df_char above.\n",
    "# Create sample data\n",
    "df_sample = create_sample_dataframe(100)\n",
    "\n",
    "# Add some duplicate participant IDs to demonstrate deduplication.\n",
    "# NOTE(review): .sample(20) is given no random_state; the selection presumably\n",
    "# depends on the global NumPy seed set inside create_sample_dataframe - confirm.\n",
    "duplicates = df_sample.sample(20).copy()\n",
    "duplicates['task_type'] = 1 - duplicates['task_type']  # Flip task type for duplicates\n",
    "df_with_duplicates = pd.concat([df_sample, duplicates])\n",
    "\n",
    "print(f\"Total rows: {len(df_with_duplicates)}\")\n",
    "print(f\"Unique participants: {df_with_duplicates['participant_id'].nunique()}\")\n",
    "\n",
    "# Calculate statistics with unique rows only (the function drops duplicate IDs itself)\n",
    "stats_by_cohort = calculate_participant_characteristics_by_cohort(df_with_duplicates)\n",
    "\n",
    "# Create comparison table\n",
    "comparison_table = create_cohort_comparison_table(df_with_duplicates)\n",
    "\n",
    "# Format for research paper\n",
    "cohort_paper_text = format_cohort_comparison_for_paper(df_with_duplicates)\n",
    "\n",
    "print(\"\\nComparison Table:\")\n",
    "print(comparison_table)\n",
    "\n",
    "print(\"\\nFormatted for Research Paper:\")\n",
    "print(cohort_paper_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6e8ceeb5-5ecd-4fe9-8ecc-f17a190c3cb7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>participant_id</th>\n",
       "      <th>Age</th>\n",
       "      <th>Cohort</th>\n",
       "      <th>Gestational_Age</th>\n",
       "      <th>mabc_percentile</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>Preterm</td>\n",
       "      <td>33.0</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>Preterm</td>\n",
       "      <td>33.0</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>Preterm</td>\n",
       "      <td>33.0</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>Preterm</td>\n",
       "      <td>33.0</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>Preterm</td>\n",
       "      <td>33.0</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16315</th>\n",
       "      <td>67</td>\n",
       "      <td>8</td>\n",
       "      <td>Term</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16316</th>\n",
       "      <td>67</td>\n",
       "      <td>8</td>\n",
       "      <td>Term</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16317</th>\n",
       "      <td>67</td>\n",
       "      <td>8</td>\n",
       "      <td>Term</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16318</th>\n",
       "      <td>67</td>\n",
       "      <td>8</td>\n",
       "      <td>Term</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16319</th>\n",
       "      <td>67</td>\n",
       "      <td>8</td>\n",
       "      <td>Term</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>16320 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       participant_id  Age   Cohort  Gestational_Age  mabc_percentile\n",
       "0                   0    5  Preterm             33.0             50.0\n",
       "1                   0    5  Preterm             33.0             50.0\n",
       "2                   0    5  Preterm             33.0             50.0\n",
       "3                   0    5  Preterm             33.0             50.0\n",
       "4                   0    5  Preterm             33.0             50.0\n",
       "...               ...  ...      ...              ...              ...\n",
       "16315              67    8     Term             40.0              0.5\n",
       "16316              67    8     Term             40.0              0.5\n",
       "16317              67    8     Term             40.0              0.5\n",
       "16318              67    8     Term             40.0              0.5\n",
       "16319              67    8     Term             40.0              0.5\n",
       "\n",
       "[16320 rows x 5 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the participant-characteristics frame; the stored output has 16320 rows, so each participant's values repeat across rows.\n",
    "df_char"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "60bd1e9d-c418-469e-9f9d-f2342994f619",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               Preterm         Term      Overall\n",
      "age_group                                       \n",
      "5-6          6 (33.3%)    7 (14.0%)   13 (19.1%)\n",
      "6-7           1 (5.6%)   12 (24.0%)   13 (19.1%)\n",
      "7-8         11 (61.1%)   31 (62.0%)   42 (61.8%)\n",
      "Total      18 (100.0%)  50 (100.0%)  68 (100.0%)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/c5/gxvjzbzd36x1q68jgw7v4zl80000gn/T/ipykernel_84879/3227765276.py:17: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_unique['age_group'] = pd.cut(df_unique['Age'],\n"
     ]
    }
   ],
   "source": [
    "def age_group_distribution(df):\n",
    "    \"\"\"\n",
    "    Tabulate unique participants per age group (5-6, 6-7, 7-8), by cohort.\n",
    "    \n",
    "    Only the first row of each 'participant_id' is used. Ages are binned into\n",
    "    left-closed intervals: '5-6' = [0, 6), '6-7' = [6, 7), '7-8' = [7, 10);\n",
    "    ages outside [0, 10) fall in no group. Every cell is formatted as\n",
    "    'count (percent%)', with percentages computed within each column.\n",
    "    The 'Total' row counts all unique participants belonging to that column.\n",
    "    \n",
    "    Parameters:\n",
    "    df (pandas.DataFrame): DataFrame with 'participant_id', 'Age', and 'Cohort' columns.\n",
    "        The caller's frame is left unmodified.\n",
    "    \n",
    "    Returns:\n",
    "    pandas.DataFrame: Formatted table with one row per age group plus a 'Total' row,\n",
    "        and one column per cohort plus an 'Overall' column.\n",
    "    \"\"\"\n",
    "    # Ensure we only use unique participants. The explicit .copy() yields an\n",
    "    # independent frame, so adding 'age_group' below no longer triggers\n",
    "    # pandas' SettingWithCopyWarning.\n",
    "    df_unique = df.drop_duplicates(subset=['participant_id']).copy()\n",
    "    \n",
    "    # Create age groups; right=False makes the bins left-closed (age 6 -> '6-7')\n",
    "    df_unique['age_group'] = pd.cut(df_unique['Age'], \n",
    "                                    bins=[0, 6, 7, 10], \n",
    "                                    labels=['5-6', '6-7', '7-8'], \n",
    "                                    right=False)\n",
    "    \n",
    "    # Count participants in each age group by cohort\n",
    "    counts = pd.crosstab(df_unique['age_group'], df_unique['Cohort'])\n",
    "    \n",
    "    # Percentage of each cohort (column) that falls in each age group\n",
    "    percentages = counts.div(counts.sum(axis=0), axis=1).mul(100).round(1)\n",
    "    \n",
    "    # Format every cohort cell as count (percentage%)\n",
    "    result = pd.DataFrame(index=counts.index)\n",
    "    for cohort in counts.columns:\n",
    "        result[cohort] = [f\"{counts.loc[age, cohort]} ({percentages.loc[age, cohort]}%)\" \n",
    "                          for age in counts.index]\n",
    "    \n",
    "    # Add an Overall column (all cohorts pooled)\n",
    "    overall_counts = df_unique['age_group'].value_counts().sort_index()\n",
    "    overall_percentages = (overall_counts / overall_counts.sum() * 100).round(1)\n",
    "    result['Overall'] = [f\"{overall_counts[age]} ({overall_percentages[age]}%)\" \n",
    "                         for age in counts.index]\n",
    "    \n",
    "    # Add totals row: number of unique participants per column\n",
    "    totals = {}\n",
    "    for col in result.columns:\n",
    "        if col == 'Overall':\n",
    "            count = len(df_unique)\n",
    "        else:\n",
    "            count = int((df_unique['Cohort'] == col).sum())\n",
    "        totals[col] = f\"{count} (100.0%)\"\n",
    "    \n",
    "    result.loc['Total'] = totals\n",
    "    \n",
    "    return result\n",
    "\n",
    "# Example usage: build the age-group table from df_char and print its plain-text form.\n",
    "age_distribution = age_group_distribution(df_char)\n",
    "print(age_distribution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8cf83e1-29ad-4975-b42b-f0ee9bb88d5c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
