{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "747902d7-bee3-4d31-a17e-5e4e1da9e1e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import pandas as pd\n",
    "import glob\n",
    "from tqdm import tqdm, trange\n",
    "\n",
    "# math imports\n",
    "import numpy as np\n",
    "import scipy\n",
    "import sklearn\n",
    "\n",
    "from dataclasses import dataclass\n",
    "\n",
    "import matplotlib\n",
    "from matplotlib import pyplot\n",
    "\n",
    "import copy\n",
    "import sys\n",
    "sys.path.append('../../../')\n",
    "from src.data_class import matrix_class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "922b91ac-44cd-4942-b10b-3b77899ac83e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def has_numbers(inputString):\n",
    "    \"\"\"Return True if ``inputString`` contains at least one digit character, else False.\"\"\"\n",
    "    for char in inputString:\n",
    "        if char.isdigit():\n",
    "            return True\n",
    "    return False\n",
    "\n",
    "# Silence all Python warnings raised from here on in this kernel session.\n",
    "# NOTE(review): a blanket \"ignore\" filter also hides pandas/numpy warnings from the cells below.\n",
    "import warnings\n",
    "warnings.simplefilter(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d94b21cf-0fab-4ccb-91eb-6063fd8fa27b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Max age 21.899041, min age 5.003878\n",
      "Sex code : Male = 0, Female = 1\n"
     ]
    }
   ],
   "source": [
    "# Load the HBN basic demographics table (columns used in this notebook: EID, Sex, Age).\n",
    "HBN_basic_demos = pd.read_csv(\"../9994_Basic_Demos_20210310.csv\", low_memory=False)\n",
    "# Skip the first data row (presumably a description row under the header -- TODO confirm).\n",
    "# .copy() detaches the slice, so the column assignment below writes to an independent frame\n",
    "# instead of going through pandas' SettingWithCopy path.\n",
    "HBN_basic_demos = HBN_basic_demos.iloc[1:, :].copy()\n",
    "HBN_basic_demos[[\"Age\", \"Sex\"]] = HBN_basic_demos[[\"Age\", \"Sex\"]].apply(pd.to_numeric)\n",
    "\n",
    "# Age range over the whole demographics table (used further down to rescale Age).\n",
    "# Series.min()/max() skip missing values, so a single subject without an age cannot\n",
    "# turn the whole range into NaN.\n",
    "min_age = HBN_basic_demos['Age'].min()\n",
    "max_age = HBN_basic_demos['Age'].max()\n",
    "\n",
    "# Demographic columns that get merged into the questionnaire tables.\n",
    "metadata = ['EID', 'Sex', 'Age']\n",
    "\n",
    "print('Max age {}, min age {}'.format(max_age, min_age))\n",
    "print('Sex code : Male = 0, Female = 1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "6c26233f-8370-47f7-abbc-dd7df66ce18d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Questionnaire to process; both input paths below are derived from this name.\n",
    "dataname = 'CBCL'\n",
    "filename = '../9994_{}_20210310.csv'.format(dataname)\n",
    "dict_filename = '../extra_questionnaire/dictionary/{}.xlsx'.format(dataname)\n",
    "\n",
    "data = pd.read_csv(filename, low_memory=False)\n",
    "# Skip the first data row (presumably a description row under the header -- TODO confirm).\n",
    "# .copy() makes `data` an independent frame: later cells assign converted columns back into\n",
    "# it, which on a bare slice goes through pandas' SettingWithCopy path.\n",
    "data = data.iloc[1:, :].copy()\n",
    "\n",
    "# Positional indices of the two identifier columns (for use with .iloc).\n",
    "ID_index = list(data.columns).index('Anonymized ID')\n",
    "EID_index = list(data.columns).index('EID')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "17aaa571-90ac-4cce-8793-42adefc0bdc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every row whose EID occurs more than once, ordered by EID so repeats sit next to each other.\n",
    "duplicates = data.loc[data['EID'].duplicated(keep=False)].sort_values(by='EID')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "b5e46104-f137-4b81-9536-12f33dac7a25",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CBCL_01 : 1. Acts too young for his/her age\n",
      "CBCL_02 : 2. Drinks alcohol without parents' approval\n",
      "CBCL_03 : 3. Argues a lot\n",
      "CBCL_04 : 4. Fails to finish things he/she starts\n",
      "CBCL_05 : 5. There is very little he/she enjoys\n",
      "CBCL_06 : 6. Bowel movements outside toilet\n",
      "CBCL_07 : 7. Bragging, boasting\n",
      "CBCL_08 : 8. Can't concentrate, can't pay attention for long\n",
      "CBCL_09 : 9. Can't get his/her mind off certain thoughts; obsessions\n",
      "CBCL_10 : 10. Can't sit still, restless or hyperactive\n",
      "CBCL_11 : 11. Clings to adults or too dependent\n",
      "CBCL_12 : 12. Complains of loneliness\n",
      "CBCL_13 : 13. Confused or seems to be in a fog\n",
      "CBCL_14 : 14. Cries a lot\n",
      "CBCL_15 : 15. Cruel to animals\n",
      "CBCL_16 : 16. Cruelty, bullying, or meanness to others\n",
      "CBCL_17 : 17. Daydreams or gets lost in his/her thoughts\n",
      "CBCL_18 : 18. Deliberately harms self or attempts suicide\n",
      "CBCL_19 : 19. Demands a lot of attention\n",
      "CBCL_20 : 20. Destroys his/her own things\n",
      "CBCL_21 : 21. Destroys things belonging to his/her family or others\n",
      "CBCL_22 : 22. Disobedient at home\n",
      "CBCL_23 : 23. Disobedient at school\n",
      "CBCL_24 : 24. Doesn't eat well\n",
      "CBCL_25 : 25. Doesn't get along well with other kids\n",
      "CBCL_26 : 26. Doesn't seem to feel guilty after misbehaving\n",
      "CBCL_27 : 27. Easily jealous\n",
      "CBCL_28 : 28. Breaks rules at home, school, or elsewhere\n",
      "CBCL_29 : 29. Fears certain animals, situations, or places, other than school\n",
      "CBCL_30 : 30. Fears going to school\n",
      "CBCL_31 : 31. Fears he/she might think or do something bad\n",
      "CBCL_32 : 32. Feels he/she has to be perfect\n",
      "CBCL_33 : 33. Feels or complains that no one loves him/her\n",
      "CBCL_34 : 34. Feels others are out to get him/her\n",
      "CBCL_35 : 35. Feels worthless or inferior\n",
      "CBCL_36 : 36. Gets hurt a lot, accident-prone\n",
      "CBCL_37 : 37. Gets in many fights\n",
      "CBCL_38 : 38. Gets teased a lot\n",
      "CBCL_39 : 39. Hangs around with others who get in trouble\n",
      "CBCL_40 : 40. Hears sounds or voices that aren't there\n",
      "CBCL_41 : 41. Impulsive or acts without thinking\n",
      "CBCL_42 : 42. Would rather be along than with others\n",
      "CBCL_43 : 43. Lying or cheating\n",
      "CBCL_44 : 44. Bites fingernails\n",
      "CBCL_45 : 45. Nervous, highstrung, or tense\n",
      "CBCL_46 : 46. Nervous movements or twitching\n",
      "CBCL_47 : 47. Nightmares\n",
      "CBCL_48 : 48. Not liked by other kids\n",
      "CBCL_49 : 49. Constipated, doesn't move bowels\n",
      "CBCL_50 : 50. Too fearful or anxious\n",
      "CBCL_51 : 51. Feels dizzy or lightheaded\n",
      "CBCL_52 : 52. Feels too guilty\n",
      "CBCL_53 : 53. Overeating\n",
      "CBCL_54 : 54. Overtired without good reason\n",
      "CBCL_55 : 55. Overweight\n",
      "CBCL_56A :      56A. Aches or pains (not stomach or headaches)\n",
      "CBCL_56B :      56B. Headaches\n",
      "CBCL_56C :      56C. Nausea, feels sick\n",
      "CBCL_56D :      56D.A. Problems with eyes (not if corrected by glasses\n",
      "CBCL_56E :      56E. Rashes or other skin problems\n",
      "CBCL_56F :      56F. Stomachaches\n",
      "CBCL_56G :      56G. Vomiting, throwing up\n",
      "CBCL_56H :      56H.A. Other\n",
      "CBCL_57 : 57. Physically attacks people\n",
      "CBCL_58 : 58. Picks nose, skin, or other parts of body\n",
      "CBCL_59 : 59. Plays with own sex parts in public\n",
      "CBCL_60 : 60. Plays with own sex parts too much\n",
      "CBCL_61 : 61. Poor school work\n",
      "CBCL_62 : 62. Poorly coordinated or clumsy\n",
      "CBCL_63 : 63. Prefers being with older kids\n",
      "CBCL_64 : 64. Prefers being with younger kids\n",
      "CBCL_65 : 65. Refuses to talk\n",
      "CBCL_66 : 66. Repeats certain acts over and over; compulsions \n",
      "CBCL_67 : 67. Runs away from home\n",
      "CBCL_68 : 68. Screams a lot\n",
      "CBCL_69 : 69. Secretive, keeps things to self\n",
      "CBCL_70 : 70. Sees things that aren't there\n",
      "CBCL_71 : 71. Self-conscious or easily embarrassed\n",
      "CBCL_72 : 72. Sets fires\n",
      "CBCL_73 : 73. Sexual problems\n",
      "CBCL_74 : 74. Showing off or clowning\n",
      "CBCL_75 : 75. Too shy or timid\n",
      "CBCL_76 : 76. Sleeps less than most kids\n",
      "CBCL_77 : 77. Sleeps more than most kids during day and/or night\n",
      "CBCL_78 : 78. Inattentive or easily distracted\n",
      "CBCL_79 : 79. Speech problem\n",
      "CBCL_80 : 80. Stares blankly \n",
      "CBCL_81 : 81. Steals at home\n",
      "CBCL_82 : 82. Steals outside the home\n",
      "CBCL_83 : 83. Stores up too many things he/she doesn't need\n",
      "CBCL_84 : 84. Strange behavior\n",
      "CBCL_85 : 85. Strange ideas\n",
      "CBCL_86 : 86. Stubborn, sullen, or irritable\n",
      "CBCL_87 : 87. Sudden changes in mood or feelings\n",
      "CBCL_88 : 88. Sulks a lot\n",
      "CBCL_89 : 89. Suspicious\n",
      "CBCL_90 : 90. Swearing or obscene language\n",
      "CBCL_91 : 91. Talks about killing self\n",
      "CBCL_92 : 92. Talks or walks in sleep\n",
      "CBCL_93 : 93. Talks too much\n",
      "CBCL_94 : 94. Teases a lot\n",
      "CBCL_95 : 95. Temper tantrums or hot temper\n",
      "CBCL_96 : 96. Thinks about sex too much\n",
      "CBCL_97 : 97. Threatens people\n",
      "CBCL_98 : 98. Thumb-sucking\n",
      "CBCL_99 : 99. Smokes, chews, or sniffs tobacco\n",
      "CBCL_100 : 100. Trouble sleeping \n",
      "CBCL_101 : 101. Truancy, skips school\n",
      "CBCL_102 : 102. Underactive, slow moving, or lacks energy\n",
      "CBCL_103 : 103. Unhappy, sad, or depressed\n",
      "CBCL_104 : 104. Unusually loud\n",
      "CBCL_105 : 105. Uses drugs for nonmedical purposes (don't include alcohol or tobacco)\n",
      "CBCL_106 : 106. Vandalism\n",
      "CBCL_107 : 107. Wets self during the day\n",
      "CBCL_108 : 108. Wets the bed\n",
      "CBCL_109 : 109. Whining\n",
      "CBCL_110 : 110. Wishes to be of opposite sex\n",
      "CBCL_111 : 111. Withdrawn, doesn't get inolved with others\n",
      "CBCL_112 : 112. Worries\n",
      "Unique subjects : 3094\n",
      "Data size : 3094 x 122\n"
     ]
    }
   ],
   "source": [
    "# Item-level response columns: the name contains the questionnaire prefix, still has a digit\n",
    "# once the prefix is stripped, and does not contain '113' (the reason for that exclusion is\n",
    "# not recorded here -- TODO confirm).\n",
    "response_index = [(dataname in col) and\n",
    "                  (has_numbers(col.replace(dataname+'_', ''))) and\n",
    "                  ('113' not in col)\n",
    "                  for col in data.columns]\n",
    "response_index = list(np.where(response_index)[0])\n",
    "response_abbre = [data.columns[idx] for idx in response_index]\n",
    "\n",
    "# Look up the question text of every item in the data dictionary.\n",
    "dictionary = pd.read_excel(dict_filename, header=1)\n",
    "response_info = [dictionary.loc[dictionary['Variable']==v]['Question'].values.squeeze() for v in response_abbre]\n",
    "# squeeze() leaves a 0-d array for a unique match. Check that each item matched exactly one\n",
    "# dictionary row, then keep plain strings (the subscale labels are stored the same way).\n",
    "assert all(np.size(info) == 1 for info in response_info), 'every item needs exactly one dictionary entry'\n",
    "response_info = [str(info) for info in response_info]\n",
    "response = list(zip(response_abbre, response_info))\n",
    "assert len(response) > 0\n",
    "print('\\n'.join([r[0] + ' : ' + r[1] for r in response]))\n",
    "\n",
    "# Convert the item columns to numeric in `data`, then build the analysis table:\n",
    "# EID + item responses, with Sex and Age joined in from the demographics table.\n",
    "data[response_abbre] = data[response_abbre].apply(pd.to_numeric)\n",
    "df = data.copy()\n",
    "df = df.iloc[:, [EID_index]+response_index]\n",
    "# Join type and key spelled out (EID is the only column the two frames share).\n",
    "df = df.merge(HBN_basic_demos[metadata], how='left', on='EID')\n",
    "df[['Sex', 'Age']] = df[['Sex', 'Age']].apply(pd.to_numeric)\n",
    "# Move Sex, then Age, to the front: column order becomes Age, Sex, EID, items.\n",
    "df.insert(0, 'Sex', df.pop('Sex'))\n",
    "df.insert(0, 'Age', df.pop('Age'))\n",
    "\n",
    "# ===\n",
    "# De-duplicate subjects: first drop rows that repeat the same Age/EID/responses,\n",
    "# then keep only the last remaining row for each EID.\n",
    "df = df.drop_duplicates(subset=['Age', 'EID']+response_abbre)\n",
    "df = df.drop_duplicates(subset=['EID'], keep='last')\n",
    "# ===\n",
    "\n",
    "# Every retained subject must have both demographic values.\n",
    "assert np.sum(np.isnan(df[['Sex', 'Age']].values)) == 0\n",
    "\n",
    "print('Unique subjects : {}'.format(np.unique(df['EID'].values).shape[0]))\n",
    "print('Data size : {} x {}'.format(df.shape[0], df.shape[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d0808ed-3a71-42f8-badc-8cd522bf7b4e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "89e6e328-a0a6-42fd-81a0-aad0b2fe2f30",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CBCL_AD : Anxious/Depressed Raw Score\n",
      "CBCL_AD_T : Anxious/Depressed T Score\n",
      "CBCL_WD : Withdrawn/Depressed Raw Score\n",
      "CBCL_WD_T : Withdrawn/Depressed T Score\n",
      "CBCL_SC : Somatic Complaints Raw Score\n",
      "CBCL_SC_T : Somatic Complaints T Score\n",
      "CBCL_SP : Social Problems Raw Score\n",
      "CBCL_SP_T : Social Problems T Score\n",
      "CBCL_TP : Thought Problems Raw Score\n",
      "CBCL_TP_T : Thought Problems T Score\n",
      "CBCL_AP : Attention Problems Raw Score\n",
      "CBCL_AP_T : Attention Problems T Score\n",
      "CBCL_RBB : Rule Breaking Behavior Raw Score\n",
      "CBCL_RBB_T : Rule Breaking Behavior T Score\n",
      "CBCL_AB : Aggressive Behavior Raw Score\n",
      "CBCL_AB_T : Aggressive Behavior T Score\n",
      "CBCL_OP : Other Problems Raw Score\n",
      "CBCL_Int : Internalizing Raw Score\n",
      "CBCL_Int_T : Internalizing T Score\n",
      "CBCL_Ext : Externalizing Raw Score\n",
      "CBCL_Ext_T : Externalizing T Score\n",
      "CBCL_C : C Score Raw Score\n",
      "CBCL_Total : Total Raw Score\n",
      "CBCL_TOTAL_T : Total T Score\n",
      "Unique subjects : 3094\n",
      "Data size : 3094 x 27\n"
     ]
    }
   ],
   "source": [
    "# Summary-score columns to extract from the questionnaire table.\n",
    "subscale_abbre = ['CBCL_AD', 'CBCL_AD_T',\n",
    "                  'CBCL_WD', 'CBCL_WD_T',\n",
    "                  'CBCL_SC', 'CBCL_SC_T',\n",
    "                  'CBCL_SP', 'CBCL_SP_T',\n",
    "                  'CBCL_TP', 'CBCL_TP_T',\n",
    "                  'CBCL_AP', 'CBCL_AP_T',\n",
    "                  'CBCL_RBB', 'CBCL_RBB_T',\n",
    "                  'CBCL_AB', 'CBCL_AB_T',\n",
    "                  'CBCL_OP', 'CBCL_Int',\n",
    "                  'CBCL_Int_T', 'CBCL_Ext',\n",
    "                  'CBCL_Ext_T', 'CBCL_C',\n",
    "                  'CBCL_Total', 'CBCL_TOTAL_T'\n",
    "                 ]\n",
    "subscale_index = [list(data.columns).index(s) for s in subscale_abbre]\n",
    "# Human-readable label of each score, taken from the data dictionary and stored as str.\n",
    "subscale_info = [dictionary.loc[dictionary['Variable']==v]['Question'].values.squeeze() for v in subscale_abbre]\n",
    "subscale_info = [str(s) for s in subscale_info]\n",
    "# The label of the total T score is set by hand (presumably the dictionary lookup does not\n",
    "# return a usable one -- TODO confirm). It is addressed by column name, not by list position,\n",
    "# so reordering subscale_abbre cannot attach the label to a different score.\n",
    "subscale_info[subscale_abbre.index('CBCL_TOTAL_T')] = 'Total T Score'\n",
    "\n",
    "subscale = list(zip(subscale_abbre, subscale_info))\n",
    "\n",
    "print('\\n'.join([s[0] + ' : ' + s[1] for s in subscale]))\n",
    "\n",
    "# Convert the score columns to numeric in `data`, then build the subscale table:\n",
    "# EID + scores, with Sex and Age joined in from the demographics table.\n",
    "data[subscale_abbre] = data[subscale_abbre].apply(pd.to_numeric)\n",
    "df_subscale = data.copy()\n",
    "df_subscale = df_subscale.iloc[:, [EID_index]+subscale_index]\n",
    "# Join type and key spelled out (EID is the only column the two frames share).\n",
    "df_subscale = df_subscale.merge(HBN_basic_demos[metadata], how='left', on='EID')\n",
    "df_subscale[['Sex', 'Age']] = df_subscale[['Sex', 'Age']].apply(pd.to_numeric)\n",
    "# Move Sex, then Age, to the front: column order becomes Age, Sex, EID, scores.\n",
    "df_subscale.insert(0, 'Sex', df_subscale.pop('Sex'))\n",
    "df_subscale.insert(0, 'Age', df_subscale.pop('Age'))\n",
    "\n",
    "# ===\n",
    "# De-duplicate subjects: first drop rows that repeat the same Age/EID/scores,\n",
    "# then keep only the last remaining row for each EID.\n",
    "df_subscale = df_subscale.drop_duplicates(subset=['Age', 'EID']+subscale_abbre)\n",
    "df_subscale = df_subscale.drop_duplicates(subset=['EID'], keep='last')\n",
    "# ===\n",
    "\n",
    "# Every retained subject must have both demographic values.\n",
    "assert np.sum(np.isnan(df_subscale[['Sex', 'Age']].values)) == 0\n",
    "\n",
    "print('Unique subjects : {}'.format(np.unique(df_subscale['EID'].values).shape[0]))\n",
    "print('Data size : {} x {}'.format(df_subscale.shape[0], df_subscale.shape[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "ae56f67c-ed9e-47fd-92f0-9b6edfa70bc0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bounds of a single item response, fixed by hand.\n",
    "# NOTE(review): assumes every item is scored on a 0-2 scale -- confirm against the data dictionary.\n",
    "min_response, max_response = 0.0, 2.0\n",
    "\n",
    "# Per-column extremes of the subscale scores, ignoring missing entries.\n",
    "min_subscale = np.nanmin(df_subscale.loc[:, subscale_abbre].values, axis=0)\n",
    "max_subscale = np.nanmax(df_subscale.loc[:, subscale_abbre].values, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76a3f811-cf72-4cec-8ba4-ee74b1c29585",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "01366b72-d908-468d-a5af-75447c0a15d0",
   "metadata": {},
   "source": [
    "### Load CBCL questions information (2001 version)\n",
    "- The questions are the 2001 version (same for both ABCD and HBN)\n",
    "- The third column in the original csv file: `CBCL2001_item_variables.csv` corresponds to the grouping of questions\n",
    "- A typo in item 56e was fixed and the corrected table was saved as a new CSV file, `CBCL2001_item_variables_fixed.csv`\n",
    "- A new column, `group`, encodes each question's group ID. For instance, `4-002` stands for question group 4, second question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "d275e57b-477e-495d-adf2-1f279b56a2da",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Item table for the 2001 CBCL: one row per question, with its variable name and its scale.\n",
    "CBCL_question = pd.read_csv('../CBCL2001_item_variables_fixed.csv', index_col=False)\n",
    "# Drop the final row (presumably not a question -- TODO confirm). .copy() detaches the slice so\n",
    "# that adding the 'group' column below writes to an independent frame rather than to a slice.\n",
    "CBCL_question = CBCL_question.iloc[:-1, :].copy()\n",
    "\n",
    "# Sorted unique scale names; a scale's position in this list is its numeric group id.\n",
    "QuestionClass = list(np.unique(CBCL_question['CBCL2001_6-18_scale'].values))\n",
    "\n",
    "# Group code '<scale position>-<varname text before the first dot>' for every question.\n",
    "# Iterating over the column values (instead of .loc[k]) avoids relying on row labels 0..n-1.\n",
    "group_index = [str(QuestionClass.index(scale)) + '-' + varname.split('.')[0]\n",
    "               for scale, varname in zip(CBCL_question['CBCL2001_6-18_scale'],\n",
    "                                         CBCL_question['CBCL2001_6-18_varname'])]\n",
    "CBCL_question['group'] = group_index\n",
    "question = CBCL_question['CBCL2001_6-18_varname'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "50bce442-d321-4e1d-8f2a-0c6f5b916e26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the processed questionnaire in three forms: a pickle with the DataFrames and their\n",
    "# metadata, a pickled matrix_class object, and an .npz archive with the numeric arrays.\n",
    "os.makedirs('../processed/pickle', exist_ok=True)\n",
    "os.makedirs('../processed/dataclass', exist_ok=True)\n",
    "os.makedirs('../processed/npz', exist_ok=True)\n",
    "\n",
    "# 1) Plain dictionary with the item table, the subscale table and the value ranges.\n",
    "with open(os.path.join('../processed/pickle', '{}.pickle'.format(dataname)), 'wb') as handle:\n",
    "    pickle.dump({'data':df,\n",
    "                 'data_info':'Child Behavior Checklist (CBCL) -- Age 6-18',\n",
    "                 'response_info':response,\n",
    "                 'CBCL_question': CBCL_question,\n",
    "                 'data_subscale':df_subscale,\n",
    "                 'subscale_info':subscale,\n",
    "                 'max_response':max_response,\n",
    "                 'min_response':min_response,\n",
    "                 'max_subscale':max_subscale,\n",
    "                 'min_subscale':min_subscale,\n",
    "                 'max_age':max_age,\n",
    "                 'min_age':min_age\n",
    "                },\n",
    "                handle,\n",
    "                protocol=4)\n",
    "    \n",
    "# Column names of the items and the subject IDs, in the row order of df.\n",
    "itemlist = [r[0] for r in response]\n",
    "subjlist = list(df['EID'].values)\n",
    "    \n",
    "# Raw subjects-by-items response matrix; nan_mask is 1.0 where a response is present\n",
    "# and 0.0 where it is missing.\n",
    "M_raw = df[itemlist].values\n",
    "nan_mask = 1.0 - np.isnan(M_raw)\n",
    "\n",
    "# Work on a copy: missing responses are filled with min_response, then the values are\n",
    "# rescaled so that [min_response, max_response] maps onto [0, 1].\n",
    "M = copy.deepcopy(M_raw)\n",
    "M[np.isnan(M)] = min_response\n",
    "M = (M - min_response)/(max_response - min_response)\n",
    "assert np.sum(np.isnan(M)) == 0\n",
    "\n",
    "# Confound matrix with columns [Age, Sex]; only Age (column 0) is rescaled, using the\n",
    "# min/max age computed from the demographics table. Sex is left unchanged.\n",
    "confound_raw = df[['Age', 'Sex']].values\n",
    "confound = copy.deepcopy(confound_raw)\n",
    "confound[:,0] = (confound[:,0] - min_age)/(max_age - min_age)\n",
    "assert np.sum(np.isnan(confound)) == 0\n",
    "\n",
    "# 2) Project container object. Arguments are positional; the trailing comments name the\n",
    "# slots that are left as None here (presumably filled in by later processing -- TODO confirm).\n",
    "matrix = matrix_class(M, M_raw,\n",
    "                      confound, confound_raw,\n",
    "                      nan_mask,\n",
    "                      None, None, None, # row_idx, col_idx, mask\n",
    "                      dataname,\n",
    "                      subjlist,\n",
    "                      response,\n",
    "                      None, None, # W, Q\n",
    "                      None, None, # C, Qc\n",
    "                      None, None) # Z, aZ\n",
    "\n",
    "with open(os.path.join('../processed/dataclass', '{}.pickle'.format(dataname)), 'wb') as handle:\n",
    "    pickle.dump(matrix, handle, protocol=4)\n",
    "    \n",
    "# 3) Numeric arrays only.\n",
    "# NOTE(review): the archive key is spelled 'cofounder' while the array is matrix.confound;\n",
    "# anything that loads this file has to use the same key, so the spelling is left as it is.\n",
    "np.savez(os.path.join('../processed/npz','{}.npz'.format(dataname)),\n",
    "         M = matrix.M,\n",
    "         nan_mask = matrix.nan_mask,\n",
    "         cofounder = matrix.confound)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab5de6d1-42fc-4295-a665-caab7ba26872",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bbd2327-ef3e-4c48-a1cc-f7a54446ee0a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "092dd14a-59e9-4aa6-8528-6b2790150df9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "“deepMF_test”",
   "language": "python",
   "name": "deepmf_test"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
