{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Python packages used in this code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import random\n",
    "import os\n",
    "import pickle\n",
    "import time\n",
    "import sklearn\n",
    "import platform\n",
    "import sys\n",
    "from sklearn.base import BaseEstimator, RegressorMixin\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.linear_model import Ridge, LinearRegression\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "import joblib\n",
    "import itertools\n",
    "from IPython.display import clear_output\n",
    "\n",
    "## Keras\n",
    "# The TensorFlow environment variables are set BEFORE tensorflow is imported so\n",
    "# that its native runtime sees them when it loads; setting the C++ log level\n",
    "# after the import is too late to silence messages emitted during the import.\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'\n",
    "os.environ['TF_CPP_MIN_LOG_LEVEL']='2'\n",
    "import tensorflow\n",
    "from tensorflow import keras\n",
    "from tensorflow.keras.models import Sequential, Model, model_from_json\n",
    "from tensorflow.keras.layers import Dense, Input, Add, Lambda, Dropout, Subtract, Multiply, Concatenate, Dot, BatchNormalization, Activation, LeakyReLU, ReLU\n",
    "from tensorflow.keras.losses import mse\n",
    "import keras.backend as K\n",
    "from tensorflow.keras import regularizers\n",
    "# Python-side TensorFlow logger: only show errors\n",
    "tensorflow.get_logger().setLevel(\"ERROR\")\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Environments\n",
    "\n",
    "--Platform--\n",
    "OS : Windows-10-10.0.19044-SP0\n",
    "--Version--\n",
    "python :  3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]\n",
    "numpy : 1.23.1\n",
    "pandas : 1.4.3\n",
    "sklearn : 1.1.1\n",
    "tensorflow : 2.9.1\n",
    "keras : 2.9.0\n",
    "\"\"\"\n",
    "\n",
    "# Print the platform and the versions of the main packages of the running\n",
    "# kernel, in the same layout as the reference listing in the string above.\n",
    "print('--Platform--')\n",
    "print('OS :', platform.platform())\n",
    "print('--Version--')\n",
    "version_report = [\n",
    "    ('python : ', sys.version),\n",
    "    ('numpy :', np.__version__),\n",
    "    ('pandas :', pd.__version__),\n",
    "    ('sklearn :', sklearn.__version__),\n",
    "    ('tensorflow :', tensorflow.__version__),\n",
    "    ('keras :', keras.__version__),\n",
    "]\n",
    "for label, version in version_report:\n",
    "    print(label, version)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## fix_seed function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fix_seed(seed):\n",
    "    \"\"\"Seed every random number source used in this notebook.\n",
    "\n",
    "    Stores str(seed) in the PYTHONHASHSEED environment variable and seeds\n",
    "    Python's built-in random module, NumPy's global generator and TensorFlow's\n",
    "    global generator with the same value.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    seed : int\n",
    "        Seed value passed unchanged to each library.\n",
    "    \"\"\"\n",
    "    # for hash seed (environment variables must be strings)\n",
    "    os.environ[\"PYTHONHASHSEED\"] = str(seed)\n",
    "    # built-in random, Numpy, Tensorflow\n",
    "    seeders = (random.seed, np.random.seed, tensorflow.random.set_seed)\n",
    "    for apply_seed in seeders:\n",
    "        apply_seed(seed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## PrintDot callback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PrintDot(keras.callbacks.Callback):\n",
    "    \"\"\"Keras callback that prints a compact progress trace during training.\n",
    "\n",
    "    At the end of every epoch a single '.' is printed without a newline.\n",
    "    Before that, an empty line is printed when epoch % 100 == 0 and the epoch\n",
    "    number (followed by a newline) is printed when epoch % 100 == 1.\n",
    "    \"\"\"\n",
    "\n",
    "    def on_epoch_end(self, epoch, logs=None):\n",
    "        \"\"\"Print the progress marker for `epoch`.\n",
    "\n",
    "        `logs` is accepted for compatibility with the Callback interface and\n",
    "        defaults to None like the base class; it is not used.\n",
    "        \"\"\"\n",
    "        if epoch % 100 == 0:\n",
    "            print('')\n",
    "        if epoch % 100 == 1:\n",
    "            print(epoch)\n",
    "        print('.', end='')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directories written to by this notebook (models, plots,\n",
    "# csv and pickle files). exist_ok=True makes the call a no-op when the\n",
    "# directory already exists, so the cell can be re-run safely and there is no\n",
    "# gap between checking for the directory and creating it.\n",
    "for output_dir in ('../30_Output/10_Model/100_MakeSourceModel',\n",
    "                   '../30_Output/20_Plot/100_MakeSourceModel',\n",
    "                   '../30_Output/30_csv/100_MakeSourceModel',\n",
    "                   '../30_Output/40_pkl/100_MakeSourceModel'):\n",
    "    os.makedirs(output_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset\n",
    "# NOTE: joblib.load unpickles the file, so only load data from a trusted source.\n",
    "data_all = joblib.load('../10_Data/SPSTC_290.pkl')\n",
    "# Descriptor columns that are removed from the model inputs\n",
    "drop_list = ['min:gs_mag_moment','min:num_d_unfilled','min:num_f_unfilled','min:num_f_valence','ave:num_f_unfilled','ave:num_f_valence','sum:num_f_unfilled','sum:num_f_valence','var:num_f_unfilled','var:num_f_valence','max:num_f_unfilled','max:num_f_valence']\n",
    "\n",
    "x_all = data_all['desc'].drop(drop_list,axis=1)\n",
    "y_all = data_all['data']['SPS (cm)']\n",
    "\n",
    "## Scaling parameters\n",
    "# Inputs are standardised with the mean/std of x_all; the target is\n",
    "# standardised in log space, so the log is taken once and reused.\n",
    "x_mean = x_all.mean()\n",
    "x_std = x_all.std()\n",
    "y_all_log = np.log(y_all)\n",
    "y_logmean = y_all_log.mean()\n",
    "y_logstd = y_all_log.std()\n",
    "\n",
    "# Network input width: taken from the data (number of descriptor columns left\n",
    "# after the drop) instead of a hard-coded 290, so it always matches x_all.\n",
    "dim_x = x_all.shape[1]\n",
    "# Early stopping on the validation MSE with a patience of 50 epochs\n",
    "early_stop = keras.callbacks.EarlyStopping(monitor='val_mse', patience=50)\n",
    "# Run indices 0..99 iterated over by the main loop\n",
    "i_list = np.arange(0,100,1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Main code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Saving dataframe\n",
    "# One row per trained model: the first columns hold the run settings and the\n",
    "# test metrics, the remaining columns are the sample labels of x_all (filled\n",
    "# with 1 for the samples that were in the training split of that run).\n",
    "# The width is derived from the data instead of being hard-coded.\n",
    "meta_cols = ['Itr','layers','lambda', 'dropout rate','learning rate', 'MSE', 'Corr', 'MAE']\n",
    "sample_cols = x_all.index.values.tolist()\n",
    "df_result = pd.DataFrame(np.zeros([0, len(meta_cols)+len(sample_cols)]), columns=meta_cols+sample_cols)\n",
    "\n",
    "# Settings shared by every run. They are defined once so that the values used\n",
    "# to build the model and the values written to the result table cannot diverge.\n",
    "num_layers = 3\n",
    "lmbd = 1e-2            # L2 penalty applied to every Dense kernel\n",
    "learning_rate = 1e-4   # Adam learning rate\n",
    "dr_rate = 0            # recorded in the result table; the model has no Dropout layer\n",
    "\n",
    "# Loop over every run index in i_list\n",
    "for i in i_list:\n",
    "    rnd1 = i\n",
    "    rnd2 = i*2\n",
    "    print('try : '+str(i)+' ('+str(rnd1)+', '+str(rnd2)+')')\n",
    "    fix_seed(373)\n",
    "    x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, train_size=256, random_state=int(rnd1))\n",
    "\n",
    "    ## Scaling (inputs standardised, target standardised in log space)\n",
    "    x_train_scal = (x_train - x_mean)/x_std\n",
    "    x_test_scal = (x_test - x_mean)/x_std\n",
    "    y_train_scal = (np.log(y_train)-y_logmean)/y_logstd\n",
    "\n",
    "    # A single random width in [50, 100) is drawn per run and shared by all\n",
    "    # hidden layers.\n",
    "    fix_seed(int(rnd1))\n",
    "    width_layers = np.zeros(num_layers)+np.random.randint(50,100)\n",
    "\n",
    "    ## Model definition\n",
    "    def make_model():\n",
    "        \"\"\"Build and compile the network for the current run.\n",
    "\n",
    "        dim_x inputs -> num_layers Dense layers (each followed by LeakyReLU)\n",
    "        -> one Dense output unit; every Dense kernel carries an L2 penalty.\n",
    "        \"\"\"\n",
    "        model_input = Input(shape=(dim_x,))\n",
    "        h = model_input\n",
    "        for i_model in range(num_layers):\n",
    "            h = Dense(units=width_layers[i_model], kernel_regularizer=regularizers.L2(lmbd))(h)\n",
    "            h = LeakyReLU(alpha=0.01)(h)\n",
    "\n",
    "        out = Dense(units=1, kernel_regularizer=regularizers.L2(lmbd))(h)\n",
    "        model = Model(model_input, out)\n",
    "        optimizer = keras.optimizers.Adam(learning_rate)\n",
    "        model.compile(loss='mse',\n",
    "                     optimizer=optimizer,\n",
    "                     metrics=['mae', 'mse'])\n",
    "        return(model)\n",
    "    model = make_model()\n",
    "\n",
    "    ## Model training\n",
    "    fix_seed(int(rnd1))\n",
    "    history = model.fit(\n",
    "        x_train_scal,\n",
    "        y_train_scal,\n",
    "        batch_size=8,\n",
    "        epochs=1000,\n",
    "        validation_split = 0.2,\n",
    "        verbose=0,\n",
    "        callbacks=[early_stop, PrintDot()])\n",
    "\n",
    "    # Layer sizes as text, e.g. '<dim_x>-<width>-<width>-<width>-1'\n",
    "    text_layers = '-'.join(map(str, map(int,np.append(np.append(np.array([dim_x]),width_layers),np.array([1]) ))))\n",
    "\n",
    "    # Predictions are mapped back from the scaled log space to the original scale\n",
    "    y_fits_scal = model.predict(x_train_scal)\n",
    "    y_fits = np.exp(y_fits_scal * y_logstd + y_logmean)\n",
    "    y_pred_scal = model.predict(x_test_scal)\n",
    "    y_pred = np.exp(y_pred_scal * y_logstd + y_logmean)\n",
    "\n",
    "    y_obs1 = y_train\n",
    "    y_prd1 = y_fits.reshape(-1)\n",
    "    y_obs2 = y_test\n",
    "    y_prd2 = y_pred.reshape(-1)\n",
    "\n",
    "    ## Test metrics (computed once, used by both the plot and the result table)\n",
    "    result_mse = np.mean((y_obs2-y_prd2)**2)\n",
    "    result_corr = np.corrcoef(y_prd2, y_obs2.values)[0,1]\n",
    "    result_mae = np.mean(np.abs(y_obs2-y_prd2))\n",
    "\n",
    "    ## Save\n",
    "    ### Plot\n",
    "    fig = plt.figure(figsize=(5,5))\n",
    "    plt.scatter(y_obs1, y_prd1, alpha=0.7, zorder=2, s=50, label='Train')\n",
    "    plt.scatter(y_obs2, y_prd2, alpha=0.7, zorder=3, s=50, color='darkorange', label='Test') #darkorange\n",
    "    plt.title('Model '+str(i)+' : '+text_layers, size=15)\n",
    "    plt.legend(loc='lower right')\n",
    "    plt.xlabel('Observation', size=10)\n",
    "    plt.ylabel('Prediction', size=10)\n",
    "    plt.axis('equal')\n",
    "    plt.axis('square')\n",
    "    plt.grid(color='gray', linestyle='dotted', linewidth=1, alpha=0.5)\n",
    "    fig.text(0.15, 0.83, 'Corr : '+str(round(result_corr, 4)), size=15)\n",
    "    fig.text(0.15, 0.78, 'MSE : '+str(round(result_mse, 4)), size=15)\n",
    "    fig.text(0.15, 0.73, 'MAE : '+str(round(result_mae, 4)), size=15)\n",
    "    # Use the same range on both axes, then draw the y = x reference line\n",
    "    plt.xlim([min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])])\n",
    "    plt.ylim([min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])])\n",
    "    _ = plt.plot([-300, 300], [-300, 300], color='gray', linewidth=0.5, zorder=1)\n",
    "    fig.savefig('../30_Output/20_Plot/100_MakeSourceModel/100_Model_'+str(i)+'.png')\n",
    "    plt.close(fig)\n",
    "\n",
    "    ### Dataframe\n",
    "    # tmp_df marks the training samples of this run with 1; after the concat the\n",
    "    # columns of the samples that were not in the training split are left empty.\n",
    "    tmp_df = pd.DataFrame(np.ones([1, len(x_train_scal)]), columns=x_train_scal.index.values.tolist(), index=['Model '+str(i)])\n",
    "    # NOTE: np.array() of mixed str/number values stores every entry as a string.\n",
    "    tmp_df2 = pd.DataFrame(np.array([i, text_layers, lmbd, dr_rate, learning_rate, result_mse, result_corr, result_mae]).reshape(1, -1), columns=meta_cols, index=['Model '+str(i)])\n",
    "    df_result = pd.concat([df_result, pd.concat([tmp_df2, tmp_df], axis=1)], axis=0)\n",
    "    # The csv is rewritten every iteration so partial results survive an interruption\n",
    "    df_result.to_csv('../30_Output/30_csv/100_MakeSourceModel/100_Result.csv')\n",
    "\n",
    "    ### Pickle\n",
    "    result_list = {'x_train': x_train,\n",
    "                  'x_test' : x_test,\n",
    "                  'y_train' : y_train,\n",
    "                  'y_test' : y_test,\n",
    "                  'x_mean' : x_mean,\n",
    "                  'x_std' : x_std,\n",
    "                  'text_layers' : text_layers,\n",
    "                  'width_layers' : width_layers,\n",
    "                  'y_logmean' : y_logmean,\n",
    "                  'y_logstd': y_logstd}\n",
    "    # Files are written inside with-blocks so the handles are always closed\n",
    "    # (the architecture as json, the weights as hdf5, the run data as pickle).\n",
    "    json_string = model.to_json()\n",
    "    with open('../30_Output/10_Model/100_MakeSourceModel/100_Model_'+str(i)+'.json', 'w') as f:\n",
    "        f.write(json_string)\n",
    "    model.save_weights('../30_Output/10_Model/100_MakeSourceModel/100_Model_'+str(i)+'.hdf5')\n",
    "    with open('../30_Output/40_pkl/100_MakeSourceModel/100_Model_'+str(i)+'.pkl','wb') as f:\n",
    "        pickle.dump(result_list,f)\n",
    "\n",
    "    # Release the Keras graph/session state before the next run\n",
    "    K.clear_session()\n",
    "    del model\n",
    "    clear_output(True)\n",
    "\n",
    "clear_output(True)\n",
    "print('*** Succeeded ***')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
