{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#! pip install matplotlib\n",
    "#!pip install scipy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, warnings\n",
    "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}\n",
    "warnings.filterwarnings('ignore') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np, pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy\n",
    "import sys \n",
    "import time\n",
    "import joblib\n",
    "import multiprocessing\n",
    "import random\n",
    "\n",
    "\n",
    "sys.path.insert(0, './metrics/')\n",
    "# from discriminative_metrics3 import discriminative_score_metrics\n",
    "from discriminative_metrics2 import discriminative_score_metrics\n",
    "\n",
    "from predictive_metrics3 import predictive_score_metrics\n",
    "from visualization_metrics import visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "orig_data_dir = \"../../data/processed_orig_data/\"\n",
    "gen_data_dir = \"../../data/generated_data/\"\n",
    "\n",
    "scores_dir = './scores/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class MinMaxScaler():\n",
    "    \"\"\"Min Max normalizer.\n",
    "    Args:\n",
    "    - data: original data\n",
    "\n",
    "    Returns:\n",
    "    - norm_data: normalized data\n",
    "    \"\"\"\n",
    "    def fit_transform(self, data): \n",
    "        self.fit(data)\n",
    "        scaled_data = self.transform(data)\n",
    "        return scaled_data\n",
    "\n",
    "\n",
    "    def fit(self, data):    \n",
    "        self.mini = np.min(data, 0)\n",
    "        self.range = np.max(data, 0) - self.mini\n",
    "        return self\n",
    "        \n",
    "\n",
    "    def transform(self, data):\n",
    "        numerator = data - self.mini\n",
    "        scaled_data = numerator / (self.range + 1e-7)\n",
    "        return scaled_data\n",
    "\n",
    "    \n",
    "    def inverse_transform(self, data):\n",
    "        data *= self.range\n",
    "        data += self.mini\n",
    "        return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def confidence_interval(data, confidence=0.95):\n",
    "    a = 1.0 * np.array(data)\n",
    "    n = len(a)\n",
    "    m, se = np.mean(a), scipy.stats.sem(a)\n",
    "    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)\n",
    "    return m, h"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_list_into_lists(list_of_items, num_splits):\n",
    "    if len(list_of_items) % num_splits == 0: \n",
    "        num_per_split = (len(list_of_items) // num_splits) \n",
    "    else: \n",
    "        num_per_split = (len(list_of_items) // num_splits) + 1\n",
    "\n",
    "    list_of_split_lists = []\n",
    "    for i in range(num_splits):\n",
    "        list_of_split_lists.append(list_of_items[i * num_per_split : (i + 1) * num_per_split ])\n",
    "        \n",
    "    list_of_split_lists = [l for l in list_of_split_lists if l  ]\n",
    "\n",
    "    return list_of_split_lists"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Main Calculations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_data(params, split_num):     \n",
    "    \n",
    "    model = params[0]\n",
    "    dataset = params[1]\n",
    "    training_size = params[2]\n",
    "\n",
    "    data = []\n",
    "    print(f\"Running Model: {model}; dataset = {dataset}, perc = {training_size} on split {split_num}\")\n",
    "\n",
    "    ## original data \n",
    "    fname = f'{orig_data_dir + dataset}_subsampled_train_perc_{training_size}.npz'\n",
    "    loaded = np.load(fname)\n",
    "    ori_data = loaded['data']            \n",
    "\n",
    "    ## scale orig \n",
    "    scaler_orig = MinMaxScaler( )  \n",
    "    scaled_ori_data = scaler_orig.fit_transform(ori_data)\n",
    "\n",
    "    sample_file_name = gen_data_dir + f'{model}/{model}_gen_samples_{dataset}_perc_{training_size}.npz'\n",
    "        \n",
    "    if not os.path.isfile(sample_file_name): return\n",
    "\n",
    "    loaded = np.load(sample_file_name)\n",
    "    gen_data = loaded['data']     \n",
    "\n",
    "    # load and scale generated data \n",
    "    if model == 'vae_conv_I': \n",
    "        scaled_gen_data = scaler_orig.transform(gen_data)     \n",
    "    else: \n",
    "        scaled_gen_data = gen_data\n",
    "    \n",
    "#     print(scaled_ori_data.shape, scaled_gen_data.shape); return\n",
    "\n",
    "    # ---------------------------------------------------------------------------\n",
    "    # print(\"-\"*90); print('Visualizations:')\n",
    "    # visualization(scaled_ori_data[0:scaled_gen_data.shape[0]], scaled_gen_data, 'pca')\n",
    "    # visualization(scaled_ori_data[0:scaled_gen_data.shape[0]], scaled_gen_data, 'tsne')\n",
    "\n",
    "    # ---------------------------------------------------------------------------\n",
    "    \n",
    "    predictive_score, discriminative_score = [], []\n",
    "    for tt in range(metric_iteration):\n",
    "        temp_pred = predictive_score_metrics(scaled_ori_data, scaled_gen_data, \n",
    "                                             predictor = 'conv', # conv, rnn, nbeats\n",
    "                                             epochs = pred_epochs, print_epochs = print_period)\n",
    "        predictive_score.append(temp_pred)  \n",
    "\n",
    "        temp_disc = discriminative_score_metrics(scaled_ori_data, scaled_gen_data, print_epochs=print_period)\n",
    "        discriminative_score.append(temp_disc)   \n",
    "        print(tt, model, dataset, training_size, temp_pred, temp_disc)  \n",
    "\n",
    "    \n",
    "    pred_mean = np.round(np.mean(predictive_score), 4)\n",
    "    pred_CI = np.round(confidence_interval(predictive_score)[1], 4)\n",
    "    \n",
    "    disc_mean = np.round(np.mean(discriminative_score), 4)\n",
    "    disc_CI = np.round(confidence_interval(discriminative_score)[1], 4)\n",
    "\n",
    "    \n",
    "    print(f\"***Split/Model/Data/Perc : {split_num}/{model}/{dataset}/{training_size} Scores:\", \n",
    "          pred_mean, \"+/-\", pred_CI, disc_mean, \"+/-\", disc_CI)\n",
    "\n",
    "    #     ---------------------------------------------------------------------------\n",
    "    # save pred results\n",
    "    data =  [[  model,  dataset,   training_size,   metric_iteration,  \n",
    "                  pred_epochs, pred_mean,  pred_CI  ]]\n",
    "    cols = [ 'model', 'dataset', 'train_perc', 'iters', 'epochs', 'mean', 'conf_int']\n",
    "    df = pd.DataFrame(data, columns = cols)\n",
    "    df.insert(0, 'metric', 'pred_score')\n",
    "    df.to_csv(f\"./{scores_dir}/{model}/{model}_pred_scores_{dataset}_{training_size}.csv\", index=False,\n",
    "              float_format='%.4f')\n",
    "    \n",
    "    # save disc results\n",
    "    data =  [[  model,  dataset,   training_size,   metric_iteration,  \n",
    "                  disc_epochs, disc_mean,  disc_CI  ]]\n",
    "    cols = [ 'model', 'dataset', 'train_perc', 'iters', 'epochs' , 'mean', 'conf_int' ]\n",
    "    df = pd.DataFrame(data, columns = cols)\n",
    "    df.insert(0, 'metric', 'disc_score')\n",
    "    df.to_csv(f\"./{scores_dir}/{model}/{model}_disc_scores_{dataset}_{training_size}.csv\", index=False,\n",
    "              float_format='%.4f')\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_all_data(params_sublist, split_num): \n",
    "    num = len(params_sublist)\n",
    "    for i, params in enumerate(params_sublist): \n",
    "        evaluate_data(params, split_num)\n",
    "        print(f\"Completed {i+i} of {num} on {split_num}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start = time.time()\n",
    "\n",
    "metric_iteration = 5\n",
    "\n",
    "pred_epochs = 500\n",
    "disc_epochs = 500 \n",
    "\n",
    "print_period = 100\n",
    "\n",
    "# full selection of data to run\n",
    "models = ['timeVAE', 'rcgan', 'T_forcing', 'timegan']\n",
    "training_sizes = [2, 5, 10, 20, 100]\n",
    "datasets = ['stockv', 'air', 'sine', 'energy']\n",
    "\n",
    "\n",
    "### custom selection \n",
    "models = ['timeVAE']\n",
    "datasets = [ 'stocksv' ]\n",
    "# training_sizes = [ 2, 5, 10 ]\n",
    "\n",
    "\n",
    "params_list = [ [model, data_name, p ] for model in models for p in training_sizes  for data_name in datasets ]\n",
    "\n",
    "# Get cpu_count and use all but one for resource calculations\n",
    "num_cpus_to_use = multiprocessing.cpu_count() - 2\n",
    "if num_cpus_to_use > 8: num_cpus_to_use = 8\n",
    "if len(params_list) < num_cpus_to_use: num_cpus_to_use = len(params_list)\n",
    "# num_cpus_to_use = 1\n",
    "\n",
    "\n",
    "if num_cpus_to_use == 1: \n",
    "    evaluate_all_data(params_list, 0)    \n",
    "else: \n",
    "#     random.shuffle(params_list)\n",
    "    \n",
    "    split_params_lists = split_list_into_lists(params_list, num_cpus_to_use)\n",
    "    num_cpus_to_use = len(split_params_lists)\n",
    "    print(f\"Using {num_cpus_to_use} CPUs\")\n",
    "    \n",
    "    pool = multiprocessing.Pool(num_cpus_to_use)    \n",
    "#     print(split_params_lists); sys.exit()\n",
    "\n",
    "    # run forecasts on each thread\n",
    "    for split_num in range(num_cpus_to_use):\n",
    "        \n",
    "        pool.apply_async(evaluate_all_data, \n",
    "        args=( split_params_lists[split_num], split_num) )\n",
    "\n",
    "    pool.close()\n",
    "    pool.join() \n",
    "    \n",
    "\n",
    "end = time.time()\n",
    "print(f\"Total run time: {np.round((end - start)/60.0, 2)} minutes\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
