{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "df4f8070",
   "metadata": {},
   "source": [
    "## Preprocess the clinical dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a5270045",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca29efbc",
   "metadata": {},
   "source": [
    "The clinical data can be obtained by viewing one's cart on the GDC client and selecting the \"Clinical\" download icon. The downloaded folder contains three .tsv files, one of which is \"clinical.tsv\". First, define the function that preprocesses this file; the cell after it sets the path to the clinical data and the location where the processed dataset is stored."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90e3ddc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_clinical_csv(data_path, cv_threshold=30, project_labels=None):\n",
    "    \"\"\"Build a one-hot encoded clinical feature table from a GDC clinical.tsv export.\n",
    "\n",
    "    Despite the name, no file is written here: the processed DataFrame is\n",
    "    returned and the caller decides where to save it.\n",
    "\n",
    "    Processing steps:\n",
    "      1. Read the tab-separated file and keep the last row per case_submitter_id.\n",
    "      2. Map project_id to the target label y.\n",
    "      3. Treat the \"'--\" placeholder as missing and drop every column that\n",
    "         contains a missing value.\n",
    "      4. Drop the anatomical-site columns, which would reveal the label.\n",
    "      5. Keep the categorical (object dtype) columns whose category counts have a\n",
    "         coefficient of variation above cv_threshold, and one-hot encode them.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_path : str, path object or file-like object\n",
    "        Location of clinical.tsv; passed straight to pd.read_csv.\n",
    "    cv_threshold : float, default 30\n",
    "        Minimum coefficient of variation (in percent) of a column's category\n",
    "        counts for the column to be encoded.\n",
    "    project_labels : dict, optional\n",
    "        Mapping from project_id to label. Defaults to the five TCGA projects\n",
    "        for liver, stomach, colon, kidney and lung.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per case: the dummy columns followed by case_id and y.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If a project_id has no entry in project_labels.\n",
    "    \"\"\"\n",
    "    if project_labels is None:\n",
    "        project_labels = {\n",
    "            \"TCGA-LIHC\": \"liver\",\n",
    "            \"TCGA-STAD\": \"stomach\",\n",
    "            \"TCGA-COAD\": \"colon\",\n",
    "            \"TCGA-KIRC\": \"kidney\",\n",
    "            \"TCGA-LUAD\": \"lung\",\n",
    "        }\n",
    "\n",
    "    clinical_data_raw = pd.read_csv(data_path, sep=\"\\t\")\n",
    "    clinical_data_raw = clinical_data_raw.drop_duplicates(subset=\"case_submitter_id\", keep=\"last\")\n",
    "\n",
    "    # Create the target variable. An unmapped project would turn y into NaN, the\n",
    "    # column would then be dropped as incomplete, and the function would end in an\n",
    "    # opaque KeyError on \"y\"; fail early with a clear message instead.\n",
    "    target = clinical_data_raw[\"project_id\"].map(project_labels)\n",
    "    if target.isnull().any():\n",
    "        unknown = sorted(set(clinical_data_raw.loc[target.isnull(), \"project_id\"].astype(str)))\n",
    "        raise ValueError(f\"No label defined for project_id value(s): {unknown}\")\n",
    "    # assign() builds a new frame, so no chained-assignment warning is raised on\n",
    "    # the de-duplicated data.\n",
    "    clinical_data_labeled = clinical_data_raw.assign(y=target)\n",
    "\n",
    "    # Fill in NAs for the \"'--\" placeholder values ...\n",
    "    clinical_data_nas = clinical_data_labeled.mask(clinical_data_labeled == \"'--\")\n",
    "    # ... and keep only the columns that are complete for every case.\n",
    "    clinical_data_no_nas = clinical_data_nas.dropna(axis=1).reset_index(drop=True)\n",
    "\n",
    "    # IMPORTANT - when combining multiple TCGA projects (kidney, lung, stomach, etc.),\n",
    "    # some columns name the organ and give away the prediction task, so they are\n",
    "    # dropped before encoding. errors=\"ignore\" because either column may already\n",
    "    # have been removed above for containing missing values.\n",
    "    drop_columns = [\"tissue_or_organ_of_origin\", \"site_of_resection_or_biopsy\"]\n",
    "    clinical_data_no_nas = clinical_data_no_nas.drop(columns=drop_columns, errors=\"ignore\")\n",
    "\n",
    "    # The case identifier and the label columns must never become features. They are\n",
    "    # excluded explicitly rather than relying on the threshold: with imbalanced\n",
    "    # classes the coefficient of variation of project_id / y can exceed it.\n",
    "    never_features = {\"case_submitter_id\", \"project_id\", \"y\"}\n",
    "\n",
    "    # Several features have very little variation (same label across most cases).\n",
    "    # The coefficient of variation of the category counts is used to pick the\n",
    "    # categorical features that may be more informative for analysis.\n",
    "    cvs = {}\n",
    "    for column in clinical_data_no_nas.columns:\n",
    "        if column in never_features or clinical_data_no_nas[column].dtype != \"object\":\n",
    "            continue\n",
    "        frequencies = clinical_data_no_nas[column].value_counts()\n",
    "        cvs[column] = (frequencies.std() / frequencies.mean()) * 100\n",
    "\n",
    "    # A NaN coefficient (single category) compares False and is skipped.\n",
    "    categorical = [column for column, cv in cvs.items() if cv > cv_threshold]\n",
    "    print(categorical)\n",
    "\n",
    "    if categorical:\n",
    "        dfs = [\n",
    "            pd.get_dummies(clinical_data_no_nas[col], prefix=col, drop_first=True)\n",
    "            for col in categorical\n",
    "        ]\n",
    "        categorical_dummies = pd.concat(dfs, axis=1)\n",
    "    else:\n",
    "        # pd.concat raises on an empty list; return a frame with no feature columns.\n",
    "        categorical_dummies = pd.DataFrame(index=clinical_data_no_nas.index)\n",
    "\n",
    "    c_data = categorical_dummies.assign(\n",
    "        case_id=clinical_data_no_nas[\"case_submitter_id\"],\n",
    "        y=clinical_data_no_nas[\"y\"],\n",
    "    )\n",
    "    return c_data\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3fbf8e53",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input and output locations. NOTE(review): the leading \".../\" looks like a placeholder to be\n",
    "# replaced with the real directory before running - confirm.\n",
    "CLINICAL_DATA_PATH = \".../TCGA/clinical.tsv\"\n",
    "DESTINATION_DATA_PATH = \".../TCGA/data_processed/PRCSD_clinical_data.csv\"\n",
    "# Build the processed table; the call also prints the categorical columns that were encoded.\n",
    "clinical_data = create_clinical_csv(CLINICAL_DATA_PATH)\n",
    "# index = False keeps the row index out of the saved CSV.\n",
    "clinical_data.to_csv(DESTINATION_DATA_PATH, index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
