{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6b2b3405-adf1-4c85-9ab2-0fa9775245db",
   "metadata": {},
   "source": [
    "# Intro\n",
    "\n",
    "This notebook downloads and preprocesses the Leukemia treatment dataset used for experiments in \"Feature Selection in the Contrastive Analysis Setting\". The notebook will download any necessary files. Preprocessing steps mostly follow those of \"Probabilistic Contrastive Principal Component Analysis\" (https://github.com/andrewcharlesjones/pcpca)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8c9d411-5a35-4f9c-b32d-e00af1f4b16b",
   "metadata": {
    "tags": []
   },
   "source": [
    "### First, we download the (compressed) data files from 10x Genomics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28eda6e0-2b04-4739-a640-8ea5f27211f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import shutil\n",
    "\n",
    "def download_file(url, output_file_name):\n",
    "    \"\"\"Download `url` and save it to disk as `<output_file_name>.tar.gz`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Address of the archive to fetch.\n",
    "    output_file_name : str\n",
    "        Destination path without extension; '.tar.gz' is appended to it.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    requests.HTTPError\n",
    "        If the server answers with a 4xx/5xx status. Without this check an\n",
    "        error page would be saved under the archive's name and the failure\n",
    "        would only surface later, when the archive is unpacked.\n",
    "    \"\"\"\n",
    "    # Stream the response so the archive is never held in memory all at once\n",
    "    with requests.get(url, stream=True) as r:\n",
    "        r.raise_for_status()\n",
    "\n",
    "        with open(output_file_name + '.tar.gz', 'wb') as f:\n",
    "            for chunk in r.iter_content(chunk_size=1024 * 1024):\n",
    "                f.write(chunk)\n",
    "\n",
    "    print(\"Data successfully written to disk\")\n",
    "\n",
    "# Local archive name (without the '.tar.gz' suffix) -> 10x Genomics download URL.\n",
    "# Keeping each name next to its URL means the two can never get out of step.\n",
    "archive_sources = {\n",
    "    \"AML027_posttransplant_BMMCs\":\n",
    "        'https://cf.10xgenomics.com/samples/cell-exp/1.1.0/aml027_post_transplant/aml027_post_transplant_filtered_gene_bc_matrices.tar.gz',\n",
    "    \"AML027_pretransplant_BMMCs\":\n",
    "        'https://cf.10xgenomics.com/samples/cell-exp/1.1.0/aml027_pre_transplant/aml027_pre_transplant_filtered_gene_bc_matrices.tar.gz',\n",
    "    \"AML035_posttransplant_BMMCs\":\n",
    "        'https://cf.10xgenomics.com/samples/cell-exp/1.1.0/aml035_post_transplant/aml035_post_transplant_filtered_gene_bc_matrices.tar.gz',\n",
    "    \"AML035_pretransplant_BMMCs\":\n",
    "        'https://cf.10xgenomics.com/samples/cell-exp/1.1.0/aml035_pre_transplant/aml035_pre_transplant_filtered_gene_bc_matrices.tar.gz',\n",
    "    \"Frozen_BMMCs_HealthyControl1\":\n",
    "        'https://cf.10xgenomics.com/samples/cell-exp/1.1.0/frozen_bmmc_healthy_donor1/frozen_bmmc_healthy_donor1_filtered_gene_bc_matrices.tar.gz',\n",
    "    \"Frozen_BMMCs_HealthyControl2\":\n",
    "        'https://cf.10xgenomics.com/samples/cell-exp/1.1.0/frozen_bmmc_healthy_donor2/frozen_bmmc_healthy_donor2_filtered_gene_bc_matrices.tar.gz',\n",
    "}\n",
    "\n",
    "# Later cells index into `filenames`, so expose both views as plain lists\n",
    "# (dicts preserve insertion order, so the ordering above is kept)\n",
    "filenames = list(archive_sources.keys())\n",
    "urls = list(archive_sources.values())\n",
    "\n",
    "for output_name, url in archive_sources.items():\n",
    "    download_file(url, output_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f4b3f4d-e3a5-4967-acd9-1b8840b05afd",
   "metadata": {},
   "source": [
    "### Next, we decompress the `.tar.gz` archives"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c78033e-3fbb-4fb9-b2e8-97d4f54590e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "\n",
    "# Extract each downloaded archive into a directory that shares its base name\n",
    "for archive_name in filenames:\n",
    "    shutil.unpack_archive(archive_name + \".tar.gz\", extract_dir=archive_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9eb6dbe-5fac-4e16-98f2-e006b454c866",
   "metadata": {},
   "source": [
    "### We read in the resulting files and convert them to pandas DataFrames\n",
    "\n",
    "(This may take up to a couple of minutes depending on how powerful your computer is.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "decb67ce-1bac-40f5-873a-e0beb6a5b12b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.io import mmread\n",
    "\n",
    "from os.path import join\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "def read_data(directory, matrix_subdir=\"filtered_matrices_mex/hg19\"):\n",
    "    \"\"\"Load one unpacked sample as a log-transformed cells x genes DataFrame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    directory : str\n",
    "        Directory the sample archive was unpacked into.\n",
    "    matrix_subdir : str, optional\n",
    "        Folder, relative to `directory`, that holds `matrix.mtx`, `genes.tsv`\n",
    "        and `barcodes.tsv`. The default is the hg19 layout of the archives\n",
    "        used in this notebook.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        log(count + 1) values with one row per cell (labelled by the first\n",
    "        column of `barcodes.tsv`) and one column per gene (labelled by the\n",
    "        first column of `genes.tsv`). Cells and genes whose values are all\n",
    "        zero are dropped.\n",
    "    \"\"\"\n",
    "    matrix_dir = join(directory, matrix_subdir)\n",
    "\n",
    "    # The matrix is stored sparse; densify it so it can be wrapped in a DataFrame\n",
    "    data = mmread(join(matrix_dir, \"matrix.mtx\")).toarray()\n",
    "\n",
    "    # Log-transform the data, as is standard for gene expression data\n",
    "    data = np.log(data + 1)\n",
    "\n",
    "    genes = pd.read_table(join(matrix_dir, \"genes.tsv\"), header=None)\n",
    "    barcodes = pd.read_table(join(matrix_dir, \"barcodes.tsv\"), header=None)\n",
    "    data_df = pd.DataFrame(data, index=genes.iloc[:, 0].values, columns=barcodes.iloc[:, 0].values)\n",
    "\n",
    "    # Filter out any cells or genes that are all zeros (cells first, then genes)\n",
    "    data_df = data_df.iloc[:, np.sum(data_df.values, axis=0) != 0]\n",
    "    data_df = data_df.iloc[np.sum(data_df.values, axis=1) != 0, :]\n",
    "\n",
    "    # The original data was saved with genes for rows and cells for columns. Here we transpose it to have\n",
    "    # cells as rows with genes as columns to match the more \"standard\" ML format of samples as rows\n",
    "    # with features as columns\n",
    "    data_df = data_df.transpose()\n",
    "\n",
    "    return data_df\n",
    "\n",
    "# Load the six samples in the order they appear in `filenames`:\n",
    "# two post/pre transplant pairs followed by the two healthy controls\n",
    "(\n",
    "    post_transplant1, pre_transplant1,\n",
    "    post_transplant2, pre_transplant2,\n",
    "    healthy_control1, healthy_control2,\n",
    ") = [read_data(sample_dir) for sample_dir in filenames[:6]]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36c294b2-6d22-43d8-a5fc-8996d77d41a5",
   "metadata": {},
   "source": [
    "### Next, we preprocess our dataframes a bit before combining them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13ea0e09-bb7c-46c5-b53c-5533c6795231",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Find genes common to all patients\n",
    "all_samples = [\n",
    "    post_transplant1, pre_transplant1, post_transplant2,\n",
    "    pre_transplant2, healthy_control1, healthy_control2,\n",
    "]\n",
    "\n",
    "# Start from the first sample's genes and narrow the set down one sample at a time\n",
    "shared_genes = all_samples[0].columns.values\n",
    "for sample_df in all_samples[1:]:\n",
    "    shared_genes = np.intersect1d(shared_genes, sample_df.columns.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79695372-59b8-4476-98a1-6cd18f286523",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Combine individual dataframes into a single dataframe\n",
    "# Concatenate once rather than growing `stacked_df` inside a loop, which\n",
    "# would re-copy every previously stacked row on each iteration\n",
    "samples_in_order = [\n",
    "    post_transplant1, pre_transplant1, post_transplant2,\n",
    "    pre_transplant2, healthy_control1, healthy_control2,\n",
    "]\n",
    "stacked_df = pd.concat([curr_df[shared_genes] for curr_df in samples_in_order], axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "589fda1a-3b32-48be-ae69-53a62e455441",
   "metadata": {},
   "source": [
    "### Now that our data is preprocessed, we collect some metadata that may be useful for later analyses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e7b83ad-c63d-4c64-a760-cb2d646f2492",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Samples in the same order in which they were stacked into `stacked_df`\n",
    "sample_frames = [\n",
    "    post_transplant1, pre_transplant1,\n",
    "    post_transplant2, pre_transplant2,\n",
    "    healthy_control1, healthy_control2,\n",
    "]\n",
    "cells_per_sample = [frame.shape[0] for frame in sample_frames]\n",
    "\n",
    "# Each pre/post transplant pair came from the same patient, whereas the two\n",
    "# healthy controls came from different patients\n",
    "patient_ids = np.repeat([0.0, 0.0, 1.0, 1.0, 2.0, 3.0], cells_per_sample)\n",
    "\n",
    "# One label per cell, repeated for as many cells as each sample contributes\n",
    "conditions = np.repeat(\n",
    "    [\n",
    "        \"Post transplant\",\n",
    "        \"Pre transplant\",\n",
    "        \"Post transplant\",\n",
    "        \"Pre transplant\",\n",
    "        \"Healthy\",\n",
    "        \"Healthy\",\n",
    "    ],\n",
    "    cells_per_sample,\n",
    ").tolist()\n",
    "\n",
    "metadata = pd.DataFrame({'patient_id': patient_ids, 'condition': conditions})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b222a10f-c78d-4a1c-b3ec-aabcd5da97ed",
   "metadata": {},
   "source": [
    "### Finally, we save our preprocessed data and metadata to disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f1814a5-16aa-4d0d-805e-38a0e806a234",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# `DataFrame.to_csv` does not create missing parent directories, so make sure\n",
    "# the output folder exists before writing (a fresh checkout may not have it)\n",
    "os.makedirs(\"./data/AML\", exist_ok=True)\n",
    "\n",
    "stacked_df.to_csv(\"./data/AML/preprocessed_data.csv\")\n",
    "metadata.to_csv(\"./data/AML/metadata.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "94ceb361-5077-402e-859d-65c99cde5a33",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "98aaeed7-5974-4f1b-90cc-98475683e68a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the preprocessed expression matrix; index_col=0 uses the first CSV column as the row index\n",
    "data = pd.read_csv(\"./data/AML/preprocessed_data.csv\", index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "7ced1545-b570-4cc9-846b-0d3420abc259",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the per-cell metadata; index_col=0 uses the first CSV column as the row index\n",
    "metadata = pd.read_csv(\"./data/AML/metadata.csv\", index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "289006e9-7e57-4d43-866e-0a270058b9c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the expression matrix into one array per condition. The masks are\n",
    "# applied by position, so `data` and `metadata` must list cells in the same order.\n",
    "expression = data.values\n",
    "condition = metadata['condition']\n",
    "\n",
    "data1 = expression[condition == 'Healthy']\n",
    "data2 = expression[condition == 'Post transplant']\n",
    "data3 = expression[condition == 'Pre transplant']"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
