{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "aedd1f7c",
   "metadata": {},
   "source": [
    "## Find the intersection of patients across the five modalities and create a val-test-train split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9ecfe317",
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8e30c37",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root directories used throughout the notebook. Both are joined to file names with\n",
    "# plain string concatenation, so each must end with a \"/\".\n",
    "# NOTE(review): the leading \"...\" looks like a placeholder for the local TCGA root -\n",
    "# confirm and set it before running.\n",
    "# PRCD_DATA_PATH: processed per-modality CSVs and the images/ folder are read from here.\n",
    "# COMBINED_DATA_PATH: the combined id list, the split table and split_data/ are written here.\n",
    "PRCD_DATA_PATH =  \".../TCGA/data_processed/\"\n",
    "COMBINED_DATA_PATH = \".../TCGA/combined/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "f8b812f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _read_processed(filename):\n",
    "    \"\"\"Read one processed modality table located under PRCD_DATA_PATH.\"\"\"\n",
    "    return pd.read_csv(PRCD_DATA_PATH + filename)\n",
    "\n",
    "\n",
    "# One table per tabular modality; each is later matched on its case_id column.\n",
    "clinical = _read_processed(\"PRCSD_clinical_data.csv\")\n",
    "trans = _read_processed(\"PRCSD_transcriptomic_data.csv\")\n",
    "epi = _read_processed(\"PRCSD_epigenomic_data.csv\")\n",
    "cnv = _read_processed(\"PRCSD_cnv_data.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07ddfb4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The image file name without its extension is used as that image's case id.\n",
    "# os.path splits on the platform's own path separator, and building the frame from\n",
    "# a dict guarantees that column 0 exists even when no image file is found.\n",
    "image_list = glob.glob(PRCD_DATA_PATH + \"images/*.jpg\")\n",
    "image_ids = [os.path.splitext(os.path.basename(path))[0] for path in image_list]\n",
    "images = pd.DataFrame({0: image_ids})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "c4d06140",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the case ids that appear in every source: the four tables and the image files.\n",
    "case_id_sets = [set(table[\"case_id\"].values) for table in (clinical, trans, epi, cnv)]\n",
    "case_id_sets.append(set(images[0].values))\n",
    "ids = list(set.intersection(*case_id_sets))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed8ab680",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows of the clinical table whose case id is shared by all sources,\n",
    "# reduced to the case_id and y columns.\n",
    "in_all_sources = clinical[\"case_id\"].isin(ids)\n",
    "comb = clinical.loc[in_all_sources, [\"case_id\", \"y\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "3be61d70",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>case_id</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TCGA-55-A4DG</td>\n",
       "      <td>lung</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TCGA-55-A492</td>\n",
       "      <td>lung</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TCGA-75-7025</td>\n",
       "      <td>lung</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>TCGA-69-A59K</td>\n",
       "      <td>lung</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TCGA-BP-4804</td>\n",
       "      <td>kidney</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TCGA-BP-5202</td>\n",
       "      <td>kidney</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TCGA-B8-5550</td>\n",
       "      <td>kidney</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>TCGA-BP-4986</td>\n",
       "      <td>kidney</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TCGA-AS-3777</td>\n",
       "      <td>kidney</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TCGA-A3-3317</td>\n",
       "      <td>kidney</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>TCGA-EU-5906</td>\n",
       "      <td>kidney</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        case_id       y\n",
       "0  TCGA-55-A4DG    lung\n",
       "1  TCGA-55-A492    lung\n",
       "2  TCGA-75-7025    lung\n",
       "3  TCGA-69-A59K    lung\n",
       "0  TCGA-BP-4804  kidney\n",
       "1  TCGA-BP-5202  kidney\n",
       "2  TCGA-B8-5550  kidney\n",
       "3  TCGA-BP-4986  kidney\n",
       "4  TCGA-AS-3777  kidney\n",
       "5  TCGA-A3-3317  kidney\n",
       "6  TCGA-EU-5906  kidney"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Write the shared cohort (comb) to combined_ids.csv, leaving out the index column.\n",
    "# NOTE(review): the output stored with this cell is a rendered DataFrame, which does not\n",
    "# look like the result of this to_csv call - probably left over from an earlier version\n",
    "# of the cell; re-run or clear the output.\n",
    "comb.to_csv(COMBINED_DATA_PATH + \"combined_ids.csv\", index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b551d55e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row count that corresponds to 10% of the shared cohort, rounded to an integer.\n",
    "holdout_fraction = 0.1\n",
    "cohort_size = len(comb)\n",
    "ten_percent = round(holdout_fraction * cohort_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "03a34c36",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw the test set at random from the shared cohort. The fixed random_state makes\n",
    "# the draw repeatable, so re-running on the same data yields the same test patients.\n",
    "test = comb.sample(n=ten_percent, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c476cb30",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw the validation set from the patients that are not in the test set.\n",
    "# The fixed random_state makes the draw repeatable for the same input rows.\n",
    "not_in_test = ~comb[\"case_id\"].isin(test[\"case_id\"].values)\n",
    "val = comb[not_in_test].sample(n=ten_percent, random_state=42)\n",
    "\n",
    "# Show how many validation rows fall under each value of y.\n",
    "val[\"y\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "f551d037",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Case ids already assigned to the test set or the validation set.\n",
    "cases_taken = [*test[\"case_id\"].values, *val[\"case_id\"].values]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8af7f92e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every patient that was not held out goes to the training set; sample(frac=1)\n",
    "# shuffles those rows. The fixed random_state makes the shuffle repeatable.\n",
    "is_held_out = comb[\"case_id\"].isin(cases_taken)\n",
    "train = comb[~is_held_out].sample(frac=1, random_state=42)\n",
    "\n",
    "# Show how many training rows fall under each value of y.\n",
    "train[\"y\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "dd70fe1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag every row of each set with the name of the set it belongs to.\n",
    "for subset, split_name in ((val, \"val\"), (test, \"test\"), (train, \"train\")):\n",
    "    subset[\"split\"] = split_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "c6cbe380",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the three sets in val, test, train order. pd.concat replaces DataFrame.append,\n",
    "# which was deprecated in pandas 1.4 and removed in pandas 2.0.\n",
    "splits = pd.concat([val, test, train])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "5589e064",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the split assignment table to splits.csv, leaving out the index column.\n",
    "# The next cell reads this file back to drive the per-modality export.\n",
    "splits.to_csv(COMBINED_DATA_PATH + \"splits.csv\", index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "8cb46640",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder that receives the per-modality split files, and the saved split table.\n",
    "save_path = COMBINED_DATA_PATH + \"split_data/\"\n",
    "id_path = COMBINED_DATA_PATH + \"splits.csv\"\n",
    "\n",
    "# Read the split table back from disk (the original note suggests the in-memory\n",
    "# `splits` frame could be used instead) and pull out two parallel lists:\n",
    "# the case ids and the split each one was assigned to.\n",
    "id_df = pd.read_csv(id_path)\n",
    "case_id_column = id_df[\"case_id\"]\n",
    "split_column = id_df[\"split\"]\n",
    "ids = case_id_column.tolist()\n",
    "categories = split_column.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "eb2fbc9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "modalities = {\"transcriptomic\": trans, \"cnv\": cnv, \"epigenomic\": epi, \"clinical\": clinical}\n",
    "id_name = \"case_id\"\n",
    "\n",
    "# to_csv does not create missing folders, so make sure the target folder exists.\n",
    "os.makedirs(save_path, exist_ok=True)\n",
    "\n",
    "# Case ids of each set, in the order they are listed in the split table.\n",
    "# Any label other than \"train\" or \"test\" is treated as validation.\n",
    "assignments = pd.DataFrame({id_name: ids, \"split\": categories})\n",
    "is_train = assignments[\"split\"] == \"train\"\n",
    "is_test = assignments[\"split\"] == \"test\"\n",
    "ids_by_split = {\n",
    "    \"train\": assignments.loc[is_train, [id_name]],\n",
    "    \"test\": assignments.loc[is_test, [id_name]],\n",
    "    \"val\": assignments.loc[~(is_train | is_test), [id_name]],\n",
    "}\n",
    "\n",
    "# Write <modality>_train.csv, <modality>_test.csv and <modality>_val.csv for every modality.\n",
    "for modality, data in modalities.items():\n",
    "    for split_name, wanted_ids in ids_by_split.items():\n",
    "        # The inner merge follows the split-table order of the ids and keeps every\n",
    "        # matching row of the modality; ids with no row in this modality are skipped.\n",
    "        # It replaces growing a frame one row at a time with DataFrame.append, which\n",
    "        # is quadratic and no longer exists in pandas 2.x, and it no longer overwrites\n",
    "        # the notebook-level train/test/val frames.\n",
    "        subset = wanted_ids.merge(data, on=id_name, how=\"inner\")[list(data.columns)]\n",
    "        subset.to_csv(save_path + modality + \"_\" + split_name + \".csv\", index=False)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
