{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "eafeca45",
   "metadata": {},
   "source": [
    "## Preprocess the copy-number variation (CNV) dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b471d75e-6845-4e01-89c0-b4a6a9a1680a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c554aa79",
   "metadata": {},
   "source": [
    "We first need the path to our folder containing case-organized data and the destination for storing the processed CNV data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9aab704a",
   "metadata": {},
   "outputs": [],
   "source": [
    "ORGANIZED_BY_CASE_PATH = \".../TCGA/data_by_cases\"\n",
    "DESTINATION_DATA_PATH = \".../TCGA/data_processed/PRCSD_cnv_data.csv\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3083484a",
   "metadata": {},
   "source": [
    "We use the following function to read in CNV data. This function should be adapted to the format of CNV data used for a project. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "13d68835",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_cnv(filepath, case_id):\n",
    "    arr = []\n",
    "    with open(filepath) as f:\n",
    "        lines = f.readlines()\n",
    "        for l in lines:\n",
    "            arr.append(l.upper().split())\n",
    "    # transform 2d array into dataframe\n",
    "    matrix = pd.DataFrame(arr)\n",
    "    # get gene names as column names\n",
    "    matrix.columns = matrix.iloc[0]\n",
    "    # drop the column\n",
    "    matrix = matrix.drop(0)\n",
    "    # replace missing values with -1\n",
    "    matrix[\"COPY_NUMBER\"].fillna(\"-1\", inplace = True)\n",
    "    # transpose matrix and set ID to gene_ID\n",
    "    matrix= matrix[[\"GENE_ID\", \"COPY_NUMBER\"]].set_index(\"GENE_ID\").transpose()\n",
    "    # rename copy number column with case IDs\n",
    "    return matrix.rename(columns={'GENE_ID': 'CASE_ID'},index={'COPY_NUMBER': case_id}).reset_index().rename(columns={0:'CASE_ID'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b123c8fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_cnv(by_case_path): \n",
    "    cases = os.listdir(by_case_path)\n",
    "    #Loop through every case filepath and search for CNV data. Apply the read CSV function to each CNV data found.\n",
    "    #After all the CNV files are read, we can concatenate them to create a matrix where rows are cases, columns are genomic regions, and values are the respective copy numbers.\n",
    "    _cnv_data = []\n",
    "    i=0\n",
    "    for case in cases:\n",
    "        contents_gene_exp = os.listdir(os.path.join(by_case_path, case, \"cnv\"))\n",
    "        print(contents_gene_exp)\n",
    "        if len(contents_gene_exp) == 0:\n",
    "            i+=1\n",
    "            print(f\"{case} has no CNV expression data\")\n",
    "        else:\n",
    "            filename = contents_gene_exp[0]\n",
    "            path = os.path.join(by_case_path, case, \"cnv\", filename)\n",
    "            _cnv_data.append(read_cnv(path, case))\n",
    "\n",
    "    all_cnv_data = pd.concat(_cnv_data)\n",
    "    # reset index to case ID\n",
    "    all_cnv_data = all_cnv_data.rename(columns={\"CASE_ID\":\"case_id\"}).set_index(\"case_id\")\n",
    "\n",
    "    #For some preliminary feature reduction, we drop any columns that only have one unique value or have missing values.\n",
    "    i = 0\n",
    "    to_drop = []\n",
    "    for col in all_cnv_data.columns:\n",
    "        if len(all_cnv_data[col].unique())== 1 or ('-1' in all_cnv_data[col].unique()):\n",
    "            to_drop.append(col)\n",
    "            i+=1\n",
    "            # print(col)\n",
    "\n",
    "    print(f\"{i} columns in data will be dropped, out of {len(all_cnv_data.columns)}\")\n",
    "    all_cnv_data= all_cnv_data.drop(columns = to_drop)\n",
    "    return all_cnv_data\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05ddc0ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "cnv_data = preprocess_cnv(ORGANIZED_BY_CASE_PATH)\n",
    "cnv_data.to_csv(DESTINATION_DATA_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3fd9c720",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
