{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, warnings\n",
    "# Reduce TensorFlow's native (C++) log output. The variable is set in this first cell,\n",
    "# ahead of the `import tensorflow` in the next cell, so it is in place when TensorFlow loads.\n",
    "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}\n",
    "# Suppress every Python warning for the rest of the session.\n",
    "warnings.filterwarnings('ignore') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from random import shuffle\n",
    "import sys, os\n",
    "from datetime import datetime, timedelta\n",
    "import numpy as np , pandas as pd\n",
    "import time\n",
    "import joblib\n",
    "import random\n",
    "\n",
    "import multiprocessing\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback\n",
    "from timeVAE import TimeVAE\n",
    "from config import config as cfg\n",
    "import utils\n",
    "import math"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Define paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All paths are relative to the notebook's working directory.\n",
    "# Keep the trailing '/' on input_dir and log_dir: generate_vae_data builds file names from\n",
    "# them by plain string concatenation (input_dir + data_name, log_dir + log_file).\n",
    "input_dir = \"../../data/processed_orig_data/\"\n",
    "output_dir = \"../../data/generated_data/\"\n",
    "# Passed to vae.save() as the directory argument.\n",
    "model_dir = './model/'\n",
    "# Destination of the per-scenario training-log CSV files.\n",
    "log_dir = './log/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Utility Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def set_seeds(seed_value):\n",
    "    \"\"\"Seed Python's `random`, NumPy and TensorFlow from one value.\n",
    "\n",
    "    The value is also written (as a string) to the PYTHONHASHSEED\n",
    "    environment variable.\n",
    "\n",
    "    Args:\n",
    "        seed_value: seed handed unchanged to each generator.\n",
    "    \"\"\"\n",
    "    os.environ['PYTHONHASHSEED'] = str(seed_value)\n",
    "    # Same order as before: stdlib random, then NumPy, then TensorFlow.\n",
    "    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):\n",
    "        seed_fn(seed_value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_train_valid_split(data, valid_perc):\n",
    "    \"\"\"Randomly split `data` along its first axis into train and validation parts.\n",
    "\n",
    "    Rows are shuffled with NumPy's global random state before the split. The\n",
    "    shuffle is applied through a permutation of row indices, so the caller's\n",
    "    array is left unmodified.\n",
    "\n",
    "    Args:\n",
    "        data: NumPy array whose first axis indexes the samples.\n",
    "        valid_perc: fraction of the samples to hold out for validation.\n",
    "\n",
    "    Returns:\n",
    "        (train_data, valid_data): the first int(N * (1 - valid_perc)) shuffled\n",
    "        rows and the remaining shuffled rows.\n",
    "    \"\"\"\n",
    "    N = data.shape[0]\n",
    "    N_train = int(N * (1 - valid_perc))\n",
    "\n",
    "    # Index with a permutation instead of np.random.shuffle(data): the latter\n",
    "    # reorders the caller's array in place.\n",
    "    shuffled = data[np.random.permutation(N)]\n",
    "\n",
    "    # train, valid split\n",
    "    return shuffled[:N_train], shuffled[N_train:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scale_train_valid_data(train_data, valid_data):\n",
    "    \"\"\"Fit `utils.MinMaxScaler_Feat_Dim` on the training set and scale both sets.\n",
    "\n",
    "    Args:\n",
    "        train_data: 3-D array; its 2nd and 3rd dimensions are passed to the\n",
    "            scaler as `scaling_len` and `input_dim`.\n",
    "        valid_data: array transformed with the scaler fitted on `train_data`.\n",
    "\n",
    "    Returns:\n",
    "        (scaled_train_data, scaled_valid_data, scaler)\n",
    "    \"\"\"\n",
    "    _, seq_len, feat_dim = train_data.shape\n",
    "\n",
    "    # NOTE(review): the +/-3.0 bounds are presumably clipping limits - confirm in utils.\n",
    "    scaler = utils.MinMaxScaler_Feat_Dim(\n",
    "        scaling_len=seq_len,\n",
    "        input_dim=feat_dim,\n",
    "        upper_bound=3.0,\n",
    "        lower_bound=-3.0,\n",
    "    )\n",
    "\n",
    "    # The tuple is evaluated left to right, so the fit happens before valid_data is transformed.\n",
    "    return scaler.fit_transform(train_data), scaler.transform(valid_data), scaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class MinMaxScaler():\n",
    "    \"\"\"Min-max scaler: maps data to (approximately) the [0, 1] range.\n",
    "\n",
    "    The minimum and the range are computed along axis 0 of the array given to\n",
    "    `fit`, i.e. one value for every position of the remaining axes.\n",
    "    \"\"\"\n",
    "\n",
    "    # Added to the range so that constant features do not divide by zero.\n",
    "    _EPS = 1e-7\n",
    "\n",
    "    def fit_transform(self, data):\n",
    "        \"\"\"Fit on `data` and return its scaled version.\"\"\"\n",
    "        return self.fit(data).transform(data)\n",
    "\n",
    "    def fit(self, data):\n",
    "        \"\"\"Store the minimum and range of `data` along axis 0; returns self.\"\"\"\n",
    "        self.mini = np.min(data, 0)\n",
    "        self.range = np.max(data, 0) - self.mini\n",
    "        return self\n",
    "\n",
    "    def transform(self, data):\n",
    "        \"\"\"Return (data - min) / (range + eps) as a new array.\"\"\"\n",
    "        return (data - self.mini) / (self.range + self._EPS)\n",
    "\n",
    "    def inverse_transform(self, data):\n",
    "        \"\"\"Undo `transform` and return the result as a new array.\n",
    "\n",
    "        `data` itself is not modified, so integer inputs and arrays the caller\n",
    "        still needs are safe. The same epsilon as in `transform` is applied so\n",
    "        that inverse_transform(transform(x)) reproduces x.\n",
    "        \"\"\"\n",
    "        return data * (self.range + self._EPS) + self.mini"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_list_into_lists(list_of_items, num_splits):\n",
    "    \"\"\"Cut a list into at most `num_splits` consecutive chunks.\n",
    "\n",
    "    The chunk size is len(list_of_items) / num_splits rounded up, so the last\n",
    "    chunk may be shorter and fewer than `num_splits` chunks may come back.\n",
    "    Empty chunks are dropped from the result.\n",
    "    \"\"\"\n",
    "    whole, leftover = divmod(len(list_of_items), num_splits)\n",
    "    chunk_size = whole + 1 if leftover else whole\n",
    "\n",
    "    chunks = (\n",
    "        list_of_items[k * chunk_size : (k + 1) * chunk_size]\n",
    "        for k in range(num_splits)\n",
    "    )\n",
    "    return [chunk for chunk in chunks if chunk]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Main VAE Loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PrintLossPerNthEpoch(Callback):\n",
    "    \"\"\"Keras callback that prints the loss once every `print_period` epochs.\n",
    "\n",
    "    Args:\n",
    "        label: text placed at the start of every printed line.\n",
    "        print_period: print on every `print_period`-th epoch.\n",
    "        *args, **kwargs: forwarded to the `Callback` base class.\n",
    "    \"\"\"\n",
    "    def __init__(self, label, print_period, *args, **kwargs):\n",
    "        super().__init__(*args, **kwargs)\n",
    "        self.label = label\n",
    "        self.print_period = print_period\n",
    "\n",
    "    def on_epoch_end(self, epoch, logs=None):\n",
    "        \"\"\"Print the train loss (and the validation loss when it is logged).\"\"\"\n",
    "        # Fires when epoch + 1 is a multiple of print_period.\n",
    "        if epoch % self.print_period != (self.print_period - 1):\n",
    "            return\n",
    "\n",
    "        logs = logs or {}\n",
    "        if 'loss' not in logs:\n",
    "            # Nothing to report for this epoch.\n",
    "            return\n",
    "\n",
    "        loss = np.round(logs['loss'], 3)\n",
    "        # Explicit key check instead of a bare `except:` so that unrelated errors are not hidden.\n",
    "        if 'val_loss' in logs:\n",
    "            val_loss = np.round(logs['val_loss'], 3)\n",
    "            print( f\"{self.label} Avg. train / val loss for epoch {epoch+1}: {loss} / {val_loss} \" )\n",
    "        else:\n",
    "            print( f\"{self.label} Avg. train loss for epoch {epoch+1}: {loss} \" )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_model(model, scaled_train_data, latent_dim, hidden_layer_sizes, reconstruction_wt, epochs = 100,\n",
    "                batch_size = 32, seed = 0):\n",
    "    \"\"\"Build, compile and fit a TimeVAE on already-scaled data.\n",
    "\n",
    "    Args:\n",
    "        model: kept for interface compatibility; it is not used inside this function.\n",
    "        scaled_train_data: 3-D array; its 2nd and 3rd dimensions are passed to\n",
    "            TimeVAE as `seq_len` and `feat_dim`.\n",
    "        latent_dim: forwarded to TimeVAE.\n",
    "        hidden_layer_sizes: forwarded to TimeVAE.\n",
    "        reconstruction_wt: forwarded to TimeVAE.\n",
    "        epochs: maximum number of training epochs.\n",
    "        batch_size: mini-batch size passed to `fit`.\n",
    "        seed: value passed to `set_seeds` before the model is built.\n",
    "\n",
    "    Returns:\n",
    "        (vae, history): the fitted TimeVAE and the object returned by `vae.fit`.\n",
    "    \"\"\"\n",
    "    set_seeds(seed)\n",
    "    _, T, D = scaled_train_data.shape\n",
    "\n",
    "    # ----------------------------------------------------------------------------------------------\n",
    "    # Instantiate the VAE\n",
    "    vae = TimeVAE(\n",
    "        seq_len=T,\n",
    "        feat_dim=D,\n",
    "        latent_dim=latent_dim,\n",
    "        hidden_layer_sizes=hidden_layer_sizes,\n",
    "        reconstruction_wt=reconstruction_wt,\n",
    "        use_residual_conn=True,\n",
    "    )\n",
    "    vae.compile(optimizer=Adam())\n",
    "\n",
    "    # ----------------------------------------------------------------------------------------------\n",
    "    # Train the VAE\n",
    "    # Every callback monitors the training loss: no validation data is passed to fit().\n",
    "    callbacks = [\n",
    "        EarlyStopping(monitor='loss', min_delta=1e-1, patience=50),\n",
    "        ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10),\n",
    "        PrintLossPerNthEpoch(label='TimeVAE', print_period=50),\n",
    "    ]\n",
    "\n",
    "    history = vae.fit(\n",
    "        scaled_train_data,\n",
    "        batch_size=batch_size,\n",
    "        epochs=epochs,\n",
    "        shuffle=True,\n",
    "        callbacks=callbacks,\n",
    "        verbose=0,\n",
    "    )\n",
    "    # ----------------------------------------------------------------------------------------------\n",
    "    return vae, history"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_vae_data(params, split_num):\n",
    "    \"\"\"Train TimeVAE on one (dataset, percentage) scenario and save generated samples.\n",
    "\n",
    "    The model is trained `num_iters` times and every trained model is saved.\n",
    "    Samples are drawn from the model of the last iteration, inverse-scaled and\n",
    "    written as a compressed .npz file; a CSV with one row per iteration\n",
    "    (losses and training time) is written to `log_dir`.\n",
    "\n",
    "    Reads the notebook-level globals `input_dir`, `output_dir`, `model_dir`,\n",
    "    `log_dir`, `hyper_params`, `num_iters` and `model`; they must be defined\n",
    "    before this function is called.\n",
    "\n",
    "    Args:\n",
    "        params: sequence whose first two items are the dataset name and the\n",
    "            training-subsample percentage used in the input file name.\n",
    "        split_num: index of the split being processed; only used in progress messages.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `num_iters` is smaller than 1 (no model would be trained,\n",
    "            so there would be nothing to sample from).\n",
    "    \"\"\"\n",
    "    # set random gen seed for reproducibility\n",
    "    set_seeds(42)\n",
    "\n",
    "    data_name = params[0]\n",
    "    p = params[1]\n",
    "\n",
    "    if num_iters < 1:\n",
    "        raise ValueError(f'num_iters must be at least 1, got {num_iters}')\n",
    "\n",
    "    # file name to load\n",
    "    fname = f'{input_dir + data_name}_subsampled_train_perc_{p}.npz'\n",
    "\n",
    "    # read data; the context manager closes the .npz file handle once the array is loaded\n",
    "    with np.load(fname) as loaded:\n",
    "        ori_data = loaded['data']\n",
    "    print(fname, ori_data.shape)\n",
    "\n",
    "    scaler = MinMaxScaler()\n",
    "    scaled_ori_data = scaler.fit_transform(ori_data)\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    # hyper-parameters\n",
    "    latent_dim = hyper_params[data_name]['latent_dim']\n",
    "    hidden_layer_sizes = hyper_params[data_name]['hidden_layer_sizes']\n",
    "    reconstruction_wt = hyper_params[data_name]['reconstruction_wt']\n",
    "    print(latent_dim, hidden_layer_sizes, reconstruction_wt)\n",
    "    # -------------------------------------------------------------------------\n",
    "\n",
    "    # Create every directory written to below, so a fresh checkout does not fail\n",
    "    # after the (long) training has already finished.\n",
    "    samples_fpath = f'{model}/{model}_gen_samples_{data_name}_perc_{p}.npz'\n",
    "    samples_full_path = os.path.join( output_dir, samples_fpath)\n",
    "    for out_dir in (model_dir, log_dir, os.path.dirname(samples_full_path)):\n",
    "        os.makedirs(out_dir, exist_ok=True)\n",
    "\n",
    "    training_times = []\n",
    "    for t in range(num_iters):\n",
    "        print(\"-\"*60)\n",
    "        print(f\"Running dataset = {data_name}, perc = {p}, iter = {t} on split {split_num}\")\n",
    "\n",
    "        # start timer\n",
    "        start = time.time()\n",
    "        vae, history = train_model(model, scaled_ori_data,\n",
    "                                   latent_dim,\n",
    "                                   hidden_layer_sizes,\n",
    "                                   reconstruction_wt = reconstruction_wt,\n",
    "                                   epochs = 2000)\n",
    "\n",
    "        # stop timer and log training time (in minutes)\n",
    "        end = time.time()\n",
    "        train_time = np.round((end - start)/60.0, 2)\n",
    "\n",
    "        # the last entry of each history list is the value from the final epoch\n",
    "        training_times.append({\n",
    "            'model': model, 'data': data_name,  'perc': p, 'iter': t,\n",
    "            'latent_dim': latent_dim,\n",
    "            'loss': np.round(history.history['loss'][-1], 3),\n",
    "            'reconst_loss': np.round(history.history['reconstruction_loss'][-1],3),\n",
    "            'kl_loss': np.round(history.history['kl_loss'][-1], 3),\n",
    "            'train_time_in_min': train_time,\n",
    "        })\n",
    "        # ----------------------------------------------------------------------------------------------\n",
    "        # Save the model\n",
    "        model_name_pref = f'{model}_{data_name}_perc_{p}_iter_{t}_'\n",
    "        vae.save(model_dir, model_name_pref)\n",
    "\n",
    "    # ----------------------------------------------------------------------------------------------\n",
    "    # Generate samples\n",
    "    # We will save samples from the last iteration; as many samples as there are input series\n",
    "    scaled_samples = vae.get_prior_samples(num_samples= ori_data.shape[0])\n",
    "\n",
    "    # inverse transform using scaler\n",
    "    samples = scaler.inverse_transform(scaled_samples)\n",
    "\n",
    "    # save to output dir\n",
    "    np.savez_compressed(samples_full_path, data=samples)\n",
    "    # ----------------------------------------------------------------------------------------------\n",
    "    # log training times for the iterations\n",
    "    log_df = pd.DataFrame.from_dict(training_times)\n",
    "    print(log_df)\n",
    "    log_file = f'{model}_{data_name}_perc_{p}_train_log.csv'\n",
    "    log_df.to_csv(log_dir + log_file, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_all_data(params_sublist, split_num):\n",
    "    \"\"\"Run `generate_vae_data` for every scenario in `params_sublist`, in order.\n",
    "\n",
    "    Args:\n",
    "        params_sublist: list of parameter entries, each passed unchanged to\n",
    "            `generate_vae_data`.\n",
    "        split_num: index of this sublist; passed through to `generate_vae_data`.\n",
    "    \"\"\"\n",
    "    num = len(params_sublist)\n",
    "    # Count from 1 so the message reads \"completed 1 of N\" ... \"completed N of N\".\n",
    "    for done, params in enumerate(params_sublist, start=1):\n",
    "        generate_vae_data(params, split_num)\n",
    "        print(f\"completed {done} of {num}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-dataset TimeVAE hyper-parameters, looked up by dataset name in generate_vae_data.\n",
    "# With these multipliers 'hidden_layer_sizes' evaluates to [50, 100, 200] for every dataset;\n",
    "# only 'latent_dim' and 'reconstruction_wt' differ between datasets.\n",
    "l1_mult, l2_mult, l3_mult = 25, 25, 50\n",
    "hyper_params = {\n",
    "    'stocksv':  { \n",
    "        'latent_dim': 2 , \n",
    "        'hidden_layer_sizes': [2 * l1_mult, 4 * l2_mult, 4 * l3_mult],\n",
    "        'reconstruction_wt': 2.5,   # 1.96\n",
    "    },\n",
    "    'air':      { \n",
    "        'latent_dim': 4 , \n",
    "        'hidden_layer_sizes': [2 * l1_mult, 4 * l2_mult, 4 * l3_mult],\n",
    "        'reconstruction_wt': 2.5,   \n",
    "    },\n",
    "    'sine':     { \n",
    "        'latent_dim': 10 , \n",
    "        'hidden_layer_sizes': [2 * l1_mult, 4 * l2_mult, 4 * l3_mult],\n",
    "        'reconstruction_wt': 2.93,\n",
    "    },\n",
    "    'energy':   { \n",
    "        'latent_dim': 10 , \n",
    "        'hidden_layer_sizes': [2 * l1_mult, 4 * l2_mult, 4 * l3_mult],\n",
    "        'reconstruction_wt': 0.8,\n",
    "    },\n",
    "}\n",
    "# Bare last expression: shows the 'stocksv' entry as the cell output.\n",
    "hyper_params['stocksv']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "main_start_time = time.time()\n",
    "\n",
    "\n",
    "# how many times to run each scenario\n",
    "num_iters = 5\n",
    "\n",
    "# set 10% off for validation during VAE model development; then set to 0 for final data generation\n",
    "# NOTE(review): valid_perc is not read by any other cell of this notebook.\n",
    "valid_perc = 0.0\n",
    "\n",
    "# our model name\n",
    "model = 'timeVAE'\n",
    "\n",
    "dataset_names = ['stocksv', 'air', 'sine', 'energy']\n",
    "percs = [2, 5, 10, 20, 100]\n",
    "\n",
    "\n",
    "# to custom run specific data (this overrides the full lists above)\n",
    "dataset_names = [ 'stocksv']\n",
    "percs = [ 2 ]\n",
    "\n",
    "\n",
    "params_list = [ [data_name, p ] for data_name in dataset_names for p in percs  ]\n",
    "\n",
    "# Use all but two of the available CPUs, capped at 8 and at the number of scenarios,\n",
    "# and never fewer than one (cpu_count() - 2 is <= 0 on machines with one or two cores).\n",
    "num_cpus_to_use = max(1, min(multiprocessing.cpu_count() - 2, 8, len(params_list)))\n",
    "\n",
    "# Force a single-process run; remove this override to use the multiprocessing branch below.\n",
    "num_cpus_to_use = 1\n",
    "print(f\"Using {num_cpus_to_use} CPUs\")\n",
    "\n",
    "if num_cpus_to_use == 1:\n",
    "    generate_all_data(params_list, 0)\n",
    "else:\n",
    "    random.shuffle(params_list)\n",
    "\n",
    "    split_params_lists = split_list_into_lists(params_list, num_cpus_to_use)\n",
    "    # the split may produce fewer sublists than requested\n",
    "    num_cpus_to_use = len(split_params_lists)\n",
    "    pool = multiprocessing.Pool(num_cpus_to_use)\n",
    "\n",
    "    # run data generation for each sublist in its own worker process\n",
    "    try:\n",
    "        async_results = [\n",
    "            pool.apply_async(generate_all_data,\n",
    "                             args=( split_params_lists[split_num], split_num) )\n",
    "            for split_num in range(num_cpus_to_use)\n",
    "        ]\n",
    "    finally:\n",
    "        pool.close()\n",
    "        pool.join()\n",
    "\n",
    "    # get() re-raises an exception raised inside a worker; without this call a\n",
    "    # failed scenario would go unnoticed and the cell would still report success.\n",
    "    for async_result in async_results:\n",
    "        async_result.get()\n",
    "\n",
    "\n",
    "end = time.time()\n",
    "elapsed_time = np.round((end - main_start_time)/60.0, 2)\n",
    "print(f\"All done in {elapsed_time} minutes!\")  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load generated data for Inspection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scenario to inspect. The file name below follows the pattern used by generate_vae_data.\n",
    "# NOTE(review): the run cell above is currently restricted to 'stocksv' / perc 2, so the\n",
    "# 'sine' / perc 10 file is only present if an earlier run produced it - confirm before Run All.\n",
    "test_data = 'sine'\n",
    "test_perc = 10\n",
    "\n",
    "samples_fpath = f'{model}/{model}_gen_samples_{test_data}_perc_{test_perc}.npz'     \n",
    "loaded = np.load(os.path.join( output_dir, samples_fpath))\n",
    "gen_data = loaded['data']\n",
    "print(gen_data.shape)\n",
    "\n",
    "# Mean over axis 0 and then over the next axis, leaving one value per index of the last axis.\n",
    "print(\"generated mean : \", gen_data.mean(axis=0).mean(axis=0)) \n",
    "\n",
    "# Plot generated series via the project helper (n=5).\n",
    "utils.plot_samples(gen_data, n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
