{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "aedd1f7c",
   "metadata": {},
   "source": [
    "## In this notebook, we do feature reduction for gene expression (transcriptomic), CNV, and DNA Methylation (epigenomic) datasets. We then convert all five modalities into Pytorch tensors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9ecfe317",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('../..')\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "from PIL import Image\n",
    "from torchvision import transforms\n",
    "from common_files.custom_sets import TCGA_TabDataset, TCGA_ImgDataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8e30c37",
   "metadata": {},
   "outputs": [],
   "source": [
    "COMBINED_DATA_PATH = \".../TCGA/combined/\"\n",
    "\n",
    "access_path =  COMBINED_DATA_PATH + \"split_data/\"\n",
    "save_path =  access_path + \"reduced/\"\n",
    "id_path = COMBINED_DATA_PATH + \"splits.csv\"\n",
    "tensor_path =  COMBINED_DATA_PATH + \"tensor_data/\"\n",
    "\n",
    "image_path = \".../TCGA/data_processed/images/\"\n",
    "\n",
    "id_order = pd.read_csv(id_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3581cf60",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Since transcriptomic, CNV, and Epigenomic datasets are large, we use a RandomForest as a feature reduciton method (only showing the RF the train data)\n",
    "modalities = [\"transcriptomic\", \"cnv\", \"epigenomic\"] \n",
    "\n",
    "y_train = id_order[id_order[\"split\"] == \"train\"][\"y\"]\n",
    "\n",
    "for modality in modalities:\n",
    "    train = pd.read_csv(access_path + modality + \"_train.csv\")\n",
    "    test = pd.read_csv(access_path + modality + \"_test.csv\")\n",
    "    val = pd.read_csv(access_path + modality + \"_val.csv\")\n",
    "\n",
    "    case = \"case_id\"\n",
    "\n",
    "    X_train = train.drop(columns=[case])\n",
    "    for est in [50, 100, 150]:\n",
    "        sel = SelectFromModel(RandomForestClassifier(n_estimators = est))\n",
    "        sel.fit(X_train, y_train)\n",
    "        selected_feat= X_train.columns[(sel.get_support())]\n",
    "\n",
    "        new_train = pd.DataFrame().assign(case_id=train[case])\n",
    "        new_test = pd.DataFrame().assign(case_id=test[case])\n",
    "        new_val = pd.DataFrame().assign(case_id=val[case])\n",
    "\n",
    "        for i in selected_feat:\n",
    "            new_train[i]=train[i]\n",
    "            new_test[i]=test[i]\n",
    "            new_val[i]=val[i]\n",
    "\n",
    "        print(len(new_train.columns))   \n",
    "        print(len(new_test.columns))   \n",
    "        print(len(new_val.columns))   \n",
    "\n",
    "        new_train.to_csv(save_path + modality + \"_\" + str(est) +\"_train.csv\", index=False)\n",
    "        new_test.to_csv(save_path + modality +\"_\" + str(est) + \"_test.csv\", index=False)\n",
    "        new_val.to_csv(save_path + modality + \"_\" + str(est) +\"_val.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "773319ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "modalities_to_convert = [\"transcriptomic\", \"cnv\", \"epigenomic\", \"transcriptomic\", \"clinical\"] # no need to write images\n",
    "sets = [\"train\", \"test\", \"val\"]\n",
    "est = [50,100,150]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d75e4aeb",
   "metadata": {},
   "outputs": [],
   "source": [
    "for m in modalities_to_convert:\n",
    "    for s in sets:\n",
    "        if m != \"clinical\":\n",
    "            for e in est:\n",
    "                dataset = TCGA_TabDataset(m, s, e, COMBINED_DATA_PATH)\n",
    "                torch.save(dataset, tensor_path + m + \"_\" + str(e) + \"_\"  + s + \"_inputs.pt\") \n",
    "        else:\n",
    "            dataset = TCGA_TabDataset(m, s, 0, COMBINED_DATA_PATH)\n",
    "            torch.save(dataset, tensor_path + m + \"_\" + s + \"_inputs.pt\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d54153ba",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "id": "27dd2893",
   "metadata": {},
   "outputs": [],
   "source": [
    "id_order.y = id_order.y.map(dict(lung =1, kidney =3, liver =2, stomach=1, colon=0)).astype(int)\n",
    "ids = id_order[\"case_id\"].tolist()\n",
    "cat = id_order[\"split\"].tolist()\n",
    "c_type = id_order[\"y\"].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "id": "e02c61c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "id_order[\"path\"] = image_path + id_order[\"case_id\"] + \".jpg\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "id": "41307a1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_data(data, split):\n",
    "    path = tensor_path + \"image\" +\"_\"+ split + \"_\" + \"inputs.pt\"\n",
    "    torch.save(data, path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "id": "1f63cc72",
   "metadata": {},
   "outputs": [],
   "source": [
    "for split in [\"train\", \"test\", \"val\"]:\n",
    "    \n",
    "    df = id_order[id_order[\"split\"] == split].reset_index(drop=True)\n",
    "        \n",
    "    img_input = TCGA_ImgDataset(\n",
    "        data_frame= df,\n",
    "        transform=transforms.Compose([\n",
    "                    transforms.ToTensor(),\n",
    "                    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),\n",
    "                    transforms.Resize((120, 160))])\n",
    "    )\n",
    "    save_data(img_input, split)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
