{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "eafeca45",
   "metadata": {},
   "source": [
    "## Preprocess the DNA methylation (epigenomic) dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b471d75e-6845-4e01-89c0-b4a6a9a1680a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "162db2e1",
   "metadata": {},
   "source": [
    "We first need the path to our folder containing case-organized data and the destination for storing the processed epigenomic data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "53711c57",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder whose entries are the individual cases; create_epi_dataset looks for a\n",
    "# \"dna_methylation\" subfolder inside each of them.\n",
    "# NOTE(review): the leading \"...\" looks like a placeholder -- replace it with the real location.\n",
    "ORGANIZED_BY_CASE_PATH = \".../TCGA/data_by_cases\"\n",
    "# CSV file the processed epigenomic matrix is written to at the end of the notebook.\n",
    "DESTINATION_DATA_PATH = \".../TCGA/data_processed/PRCSD_epigenomic_data.csv\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a86d3c9a",
   "metadata": {},
   "source": [
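    "We use the following function to read in DNA methylation data. It assumes each file is tab-separated with no header row, where the first column holds the feature (genomic region) identifier and the second column holds the methylation value. Adapt this function if your project's DNA methylation files use a different format."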
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "13d68835",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_methylation(filepath, case_id):\n",
    "    \"\"\"Load one case's methylation file as a DataFrame indexed by feature.\n",
    "\n",
    "    The file is parsed as tab-separated text without a header row. Column 0\n",
    "    becomes the index and column 1 is renamed to `case_id`, so the frames of\n",
    "    several cases can later be placed side by side.\n",
    "    \"\"\"\n",
    "    raw_table = pd.read_csv(filepath, sep='\\t', header=None)\n",
    "    indexed_table = raw_table.set_index(0)\n",
    "    return indexed_table.rename(columns={1: case_id})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e09b0a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_epi_dataset(by_cases_path):\n",
    "    \"\"\"Build a cases-by-features methylation matrix from a case-organized folder.\n",
    "\n",
    "    Every entry of `by_cases_path` is treated as a case. For each case, the first\n",
    "    file (in sorted order) of its `dna_methylation` subfolder is read with\n",
    "    `read_methylation`. The per-case columns are concatenated side by side,\n",
    "    features with a missing value in any case are dropped, and the result is\n",
    "    transposed so that rows are cases and columns are features.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    by_cases_path : str\n",
    "        Folder whose entries are the individual cases.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per case that has methylation data. The `case_id` column holds\n",
    "        the case folder name; the remaining columns hold the methylation values.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If no case provides a methylation file.\n",
    "    \"\"\"\n",
    "    epigenomic_data = []\n",
    "    # os.listdir returns entries in arbitrary order; sorting keeps the row order\n",
    "    # and the file picked for each case reproducible across runs and machines.\n",
    "    for case in sorted(os.listdir(by_cases_path)):\n",
    "        methylation_dir = os.path.join(by_cases_path, case, \"dna_methylation\")\n",
    "        # Stray files and cases without a dna_methylation folder are reported and\n",
    "        # skipped, like empty folders, instead of aborting the whole run.\n",
    "        if os.path.isdir(methylation_dir):\n",
    "            contents_gene_meth = sorted(os.listdir(methylation_dir))\n",
    "        else:\n",
    "            contents_gene_meth = []\n",
    "        if not contents_gene_meth:\n",
    "            print(f\"{case} has no methylation data\")\n",
    "            continue\n",
    "        # Only the first file of the folder is used for a case.\n",
    "        path = os.path.join(methylation_dir, contents_gene_meth[0])\n",
    "        epigenomic_data.append(read_methylation(path, case))\n",
    "    if not epigenomic_data:\n",
    "        # pd.concat raises a ValueError with an unhelpful message on an empty list.\n",
    "        raise ValueError(f\"No methylation data found under {by_cases_path}\")\n",
    "    # Features are rows at this point, so dropna() removes every feature that is\n",
    "    # missing in at least one case. Transposing then puts cases on the rows, and\n",
    "    # resetting the index exposes the case id as a regular column.\n",
    "    all_epigenomic = pd.concat(epigenomic_data, axis=1)\n",
    "    all_epigenomic = all_epigenomic.dropna()\n",
    "    all_epigenomic = all_epigenomic.transpose()\n",
    "    all_epigenomic = all_epigenomic.reset_index().rename(columns={\"index\": \"case_id\"})\n",
    "    return all_epigenomic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b45c8829",
   "metadata": {},
   "outputs": [],
   "source": [
    "epi_data = create_epi_dataset(ORGANIZED_BY_CASE_PATH)\n",
    "# DataFrame.to_csv does not create missing folders, so make sure the destination\n",
    "# folder exists before writing; otherwise the finished matrix would be lost to an OSError.\n",
    "destination_dir = os.path.dirname(DESTINATION_DATA_PATH)\n",
    "if destination_dir:\n",
    "    os.makedirs(destination_dir, exist_ok=True)\n",
    "epi_data.to_csv(DESTINATION_DATA_PATH, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3087e141",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
