{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "2bced79b",
   "metadata": {},
   "source": [
    "## Create the simulation data and split it into train, validation, and test sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "db22181c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PyTorch Version:  1.11.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "print(\"PyTorch Version: \",torch.__version__)\n",
    "from torch.utils.data import TensorDataset"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "51ad886a",
   "metadata": {},
   "source": [
    "This notebook simulates the 20 modalities described in the paper. Samples with label 0 are built from 20 random values that sum to 1 (a Dirichlet draw). Samples with label 1 are built from 20 values drawn uniformly from [0, 0.15). The two ranges overlap on purpose, so a single value does not reveal its label: 0.14, for example, is below 0.15, but it could just as well be one of 20 values that sum to 1. Only the 20 values taken together separate the classes. Each value is then expanded into a length-20 vector by sampling uniformly within one standard deviation of it (the standard deviation of that sample's 20 values), so that every modality is a vector rather than a single number."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7be4b8b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simulate a balanced two-class dataset: every iteration yields one class-0 and\n",
    "# one class-1 sample, each made of N_MODALITIES vectors of length VECTOR_LEN.\n",
    "N_SAMPLES_PER_CLASS = 1000\n",
    "N_MODALITIES = 20\n",
    "VECTOR_LEN = 20\n",
    "SEED = 0\n",
    "\n",
    "# Seed NumPy's global RNG so that Restart & Run All regenerates the same data.\n",
    "np.random.seed(SEED)\n",
    "\n",
    "rows = []\n",
    "for _ in range(N_SAMPLES_PER_CLASS):\n",
    "    # Class 0: the modality centres are a Dirichlet draw, so they sum to 1.\n",
    "    centres_0 = np.random.dirichlet(np.ones(N_MODALITIES), size=1)[0]\n",
    "    # Class 1: the modality centres are independent uniform draws below 0.15.\n",
    "    centres_1 = np.random.uniform(0, 0.15, N_MODALITIES)\n",
    "    spread_0 = centres_0.std()\n",
    "    spread_1 = centres_1.std()\n",
    "    vectors_0 = []\n",
    "    vectors_1 = []\n",
    "    for j in range(N_MODALITIES):\n",
    "        # Turn each scalar centre into a vector sampled within +/- one std of it.\n",
    "        vectors_0.append(np.random.uniform(centres_0[j] - spread_0, centres_0[j] + spread_0, VECTOR_LEN))\n",
    "        vectors_1.append(np.random.uniform(centres_1[j] - spread_1, centres_1[j] + spread_1, VECTOR_LEN))\n",
    "    rows.append(vectors_0)\n",
    "    rows.append(vectors_1)\n",
    "\n",
    "# Build the frame once: DataFrame.append was removed in pandas 2.0 and re-copied\n",
    "# the whole frame on every iteration. The index alternates 0, 1, 0, 1, ... exactly\n",
    "# as the appended two-row frames did, so the index still carries each row's class.\n",
    "df = pd.DataFrame(rows, index=[0, 1] * N_SAMPLES_PER_CLASS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9d4f8025",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Promote the alternating 0/1 index produced by the simulation cell to a\n",
    "# \"label\" column and give every row a unique positional index.\n",
    "# The guard makes the cell safe to re-run: once the index has been reset, a\n",
    "# second pass would otherwise overwrite the labels with row numbers.\n",
    "if \"label\" not in df.columns:\n",
    "    df = df.assign(label=df.index).reset_index(drop=True)\n",
    "\n",
    "# Modality columns, i.e. everything except the label, in column order.\n",
    "cols = [c for c in df.columns if c != \"label\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b0fbe1cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out fixed-size test and validation sets; the remaining rows are for training.\n",
    "N_TEST = 200\n",
    "N_VAL = 200\n",
    "SPLIT_SEED = 0  # fixed so the same rows land in each split on every run\n",
    "\n",
    "test = df.sample(n=N_TEST, random_state=SPLIT_SEED)\n",
    "train_val = df.drop(test.index)\n",
    "val = train_val.sample(n=N_VAL, random_state=SPLIT_SEED)\n",
    "train = train_val.drop(val.index)\n",
    "\n",
    "# The drops are index-based, so they are only correct while df's index is unique.\n",
    "assert len(train) + len(val) + len(test) == len(df), \"splits must partition df\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "06e91d2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate the targets (\"label\" column) from the modality columns for each split.\n",
    "y_train = train[\"label\"]\n",
    "y_test = test[\"label\"]\n",
    "y_val = val[\"label\"]\n",
    "\n",
    "X_train = train.drop(columns=\"label\")\n",
    "X_test = test.drop(columns=\"label\")\n",
    "X_val = val.drop(columns=\"label\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "id": "189f5011",
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \".../simulation_data_vectors/\"\n",
    "\n",
    "\n",
    "def _modality_dataset(features, labels, modality):\n",
    "    \"\"\"Pair one modality's vectors with the integer-cast labels in a TensorDataset.\"\"\"\n",
    "    vectors = torch.Tensor(np.array(list(features[modality].values)))\n",
    "    targets = torch.Tensor(labels.astype(int).values)\n",
    "    return TensorDataset(vectors, targets)\n",
    "\n",
    "\n",
    "# Write one file per (split, modality) pair under `path`.\n",
    "for i in range(20):\n",
    "    train_inputs = _modality_dataset(X_train, y_train, i)\n",
    "    val_inputs = _modality_dataset(X_val, y_val, i)\n",
    "    test_inputs = _modality_dataset(X_test, y_test, i)\n",
    "\n",
    "    suffix = str(i) + \"_inputs.pt\"\n",
    "    torch.save(train_inputs, path + \"train_modality_\" + suffix)\n",
    "    torch.save(val_inputs, path + \"val_modality_\" + suffix)\n",
    "    torch.save(test_inputs, path + \"test_modality_\" + suffix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "id": "98c3b164",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the feature frame and label series of every split as pickles under `path`.\n",
    "pickle_targets = (\n",
    "    (X_train, \"/X_train.pkl\"),\n",
    "    (y_train, \"/y_train.pkl\"),\n",
    "    (X_test, \"/X_test.pkl\"),\n",
    "    (y_test, \"/y_test.pkl\"),\n",
    "    (X_val, \"/X_val.pkl\"),\n",
    "    (y_val, \"/y_val.pkl\"),\n",
    ")\n",
    "for obj, filename in pickle_targets:\n",
    "    obj.to_pickle(path + filename)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
