{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5f0ec7cc-f5e7-4f8d-ae5d-f257e33390f7",
   "metadata": {},
   "source": [
    "# Intro\n",
    "\n",
    "This notebook preprocesses the mice protein dataset used for experiments in \"Feature Selection in the Contrastive Analysis Setting\". The `.csv` file used in this notebook can be downloaded from https://www.kaggle.com/ruslankl/mice-protein-expression. Preprocessing in this notebook largely follows that done in \"Concrete Autoencoders for Differentiable Feature Selection and Reconstruction\", (https://github.com/mfbalin/Concrete-Autoencoders/blob/master/experiments/generate_comparison_figures.py)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9fb0c7ad-c23b-45f6-aff4-90b3e26e0bd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fdc2844e-e550-4e49-a2b7-b66960c72640",
   "metadata": {},
   "outputs": [],
   "source": [
    "one_hot = False\n",
    "filling_value = -100000\n",
    "\n",
    "data = np.genfromtxt('data/mice/Data_Cortex_Nuclear.csv', delimiter = ',', skip_header = 1, usecols = range(1, 78), filling_values = filling_value, encoding = 'UTF-8')\n",
    "classes = np.genfromtxt('data/mice/Data_Cortex_Nuclear.csv', delimiter = ',', skip_header = 1, usecols = range(78, 81), dtype = None, encoding = 'UTF-8')\n",
    "\n",
    "# Imputing missing values\n",
    "for i, row in enumerate(data):\n",
    "    for j, val in enumerate(row):\n",
    "        if val == filling_value:\n",
    "            data[i, j] = np.mean([data[k, j] for k in range(classes.shape[0]) if np.all(classes[i] == classes[k])])\n",
    "\n",
    "data = MinMaxScaler(feature_range=(0,1)).fit_transform(data)\n",
    "data = data.astype(np.float32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5d3f098c-5715-4913-92b2-9a7bd77aa5aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "target = data[np.where(classes[:,-1]=='S/C')[0]]\n",
    "background = data[(classes[:,-1]=='C/S') & (classes[:,-2]=='Saline') & (classes[:,-3]=='Control')]\n",
    "target_labels = classes[classes[:,-1]=='S/C']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "011b9e10-0b67-42ed-ba24-82e80468537b",
   "metadata": {},
   "outputs": [],
   "source": [
    "label_list = []\n",
    "for row in target_labels:\n",
    "    label_str = ''\n",
    "    if row[0] == 'Control':\n",
    "        label_str = label_str + 'C'\n",
    "    else:\n",
    "        label_str = label_str + 'T' #T for trisomy aka Down Syndrome\n",
    "        \n",
    "    if row[1] == 'Memantine':\n",
    "        label_str = label_str + 'M'\n",
    "    else:\n",
    "        label_str = label_str + 'S' #S for 'Saline' aka control treatment\n",
    "    label_list += [label_str]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f1b77561-ce3c-459a-b8b0-ec0b2b31399a",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = LabelEncoder().fit_transform(label_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "12597910-8111-41b2-a7d6-4cb5a40810b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save(\"data/mice/target.npy\", target)\n",
    "np.save(\"data/mice/background.npy\", background)\n",
    "np.save(\"data/mice/target_labels.npy\", labels)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
