{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "741992ed-9816-4664-8342-e877246d8a89",
   "metadata": {},
   "source": [
    "### Python packages used in this code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c505020-4919-4226-9bc1-47f81ec3b9ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run Setups.py\n",
    "%run Setups_tf.py\n",
    "%matplotlib inline\n",
    "\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'\n",
    "os.environ['TF_CPP_MIN_LOG_LEVEL']='2'\n",
    "tensorflow.get_logger().setLevel(\"ERROR\")\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.metrics import confusion_matrix, accuracy_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "688e8689-5654-467b-98ee-74edc8e980b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Environments\n",
    "\n",
    "--Platform--\n",
    "OS : Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.35\n",
    "--Version--\n",
    "python :  3.9.16 (main, Mar  8 2023, 14:00:05) \n",
    "[GCC 11.2.0]\n",
    "numpy : 1.23.5\n",
    "pandas : 2.0.0\n",
    "tensorflow : 2.12.0\n",
    "sklearn : 2.12.0\n",
    "\"\"\"\n",
    "\n",
    "print('--Platform--')\n",
    "print('OS :', platform.platform())\n",
    "print('--Version--')\n",
    "print('python : ', sys.version)\n",
    "print('numpy :', np.__version__)\n",
    "print('pandas :', pd.__version__)\n",
    "print('tensorflow :', tf.__version__)\n",
    "print('sklearn :', tf.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "53635dea-651e-420b-9fd3-35f653035cd5",
   "metadata": {},
   "source": [
    "### Arguments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7520dee-06c8-42ff-8907-22b7ae19e7a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "project_name = '200_FeatureExtraction'\n",
    "out_dir = '../30_Output/'+project_name+'/'\n",
    "seed = 373"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f2d4c86-fe20-442d-90cd-e168ce8ae6de",
   "metadata": {},
   "source": [
    "### Make directories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f0566df-2dbf-4c5a-8580-d88fad059f81",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.isdir(out_dir):\n",
    "    os.makedirs(out_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e83cd67-7830-4eff-a9d6-df78b08e25df",
   "metadata": {},
   "source": [
    "### Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c47e340-e3e7-455f-988f-355d986bcb0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_model_reg(hp):\n",
    "    fix_seed(seed)\n",
    "    num_layers128 = hp.Int('num_layers128', 0, 1)\n",
    "    num_layers64 = hp.Int('num_layers64', 0, 1)\n",
    "    num_layers32 = hp.Int('num_layers32', 0, 2)\n",
    "    num_layers16 = hp.Int('num_layers16', 0, 2)\n",
    "    \n",
    "    model = keras.Sequential()\n",
    "    if num_layers128 > 0:\n",
    "        for i in range(num_layers128):\n",
    "            model.add(Dense(\n",
    "                units=128,\n",
    "                kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                bias_initializer=keras.initializers.Zeros()))\n",
    "            model.add(Activation('relu'))\n",
    "            model.add(Dropout(0.1))\n",
    "    if num_layers64 > 0:\n",
    "        for i in range(num_layers64):\n",
    "            model.add(Dense(\n",
    "                units=64,\n",
    "                kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                bias_initializer=keras.initializers.Zeros()))\n",
    "            model.add(Activation('relu'))\n",
    "            model.add(Dropout(0.1))\n",
    "    if num_layers32 > 0:\n",
    "        for i in range(num_layers32):\n",
    "            model.add(Dense(\n",
    "                units=32,\n",
    "                kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                bias_initializer=keras.initializers.Zeros()))\n",
    "            model.add(Activation('relu'))\n",
    "            model.add(Dropout(0.1))\n",
    "    if num_layers16 > 0:\n",
    "        for i in range(num_layers16):\n",
    "            model.add(Dense(\n",
    "                units=16,\n",
    "                kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                bias_initializer=keras.initializers.Zeros()))\n",
    "            model.add(Activation('relu'))\n",
    "            model.add(Dropout(0.1))\n",
    "    model.add(Dense(1, kernel_initializer=keras.initializers.glorot_uniform(seed=seed),bias_initializer=keras.initializers.Zeros()))\n",
    "    model.compile(\n",
    "        # optimizer=keras.optimizers.Adam(0.01),#grad(learning_rate=0.01, epsilon=1e-7),#, decay=0.0),\n",
    "        optimizer=keras.optimizers.Adagrad(learning_rate=0.01, epsilon=1e-7),\n",
    "        loss='mse',\n",
    "        metrics=['mae', 'mse'])\n",
    "    return model\n",
    "\n",
    "def build_model_cls(hp):\n",
    "    fix_seed(seed)\n",
    "    num_layers128 = hp.Int('num_layers128', 0, 1)\n",
    "    num_layers64 = hp.Int('num_layers64', 0, 1)\n",
    "    num_layers32 = hp.Int('num_layers32', 0, 2)\n",
    "    num_layers16 = hp.Int('num_layers16', 0, 2)\n",
    "    \n",
    "    model = keras.Sequential()\n",
    "    if num_layers128 > 0:\n",
    "        for i in range(num_layers128):\n",
    "            model.add(Dense(\n",
    "                units=128,\n",
    "                kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                bias_initializer=keras.initializers.Zeros()))\n",
    "            model.add(Activation('relu'))\n",
    "            model.add(Dropout(0.1))\n",
    "    if num_layers64 > 0:\n",
    "        for i in range(num_layers64):\n",
    "            model.add(Dense(\n",
    "                units=64,\n",
    "                kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                bias_initializer=keras.initializers.Zeros()))\n",
    "            model.add(Activation('relu'))\n",
    "            model.add(Dropout(0.1))\n",
    "    if num_layers32 > 0:\n",
    "        for i in range(num_layers32):\n",
    "            model.add(Dense(\n",
    "                units=32,\n",
    "                kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                bias_initializer=keras.initializers.Zeros()))\n",
    "            model.add(Activation('relu'))\n",
    "            model.add(Dropout(0.1))\n",
    "    if num_layers16 > 0:\n",
    "        for i in range(num_layers16):\n",
    "            model.add(Dense(\n",
    "                units=16,\n",
    "                kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                bias_initializer=keras.initializers.Zeros()))\n",
    "            model.add(Activation('relu'))\n",
    "            model.add(Dropout(0.1))\n",
    "    model.add(Dense(1, activation='sigmoid', kernel_initializer=keras.initializers.glorot_uniform(seed=seed),bias_initializer=keras.initializers.Zeros()))\n",
    "    model.compile(\n",
    "        # optimizer=keras.optimizers.Adam(0.01),#grad(learning_rate=0.01, epsilon=1e-7),#, decay=0.0),\n",
    "        optimizer=keras.optimizers.Adagrad(learning_rate=0.01, epsilon=1e-7),\n",
    "        loss='binary_crossentropy',\n",
    "        metrics=['accuracy']\n",
    "    )\n",
    "    return model\n",
    "\n",
    "def build_model_reg_Affine(hp):\n",
    "    fix_seed(seed)\n",
    "    num_layers128 = hp.Int('num_layers128', 0, 1)\n",
    "    num_layers64 = hp.Int('num_layers64', 0, 1)\n",
    "    num_layers32 = hp.Int('num_layers32', 0, 2)\n",
    "    num_layers16 = hp.Int('num_layers16', 0, 2)\n",
    "    lmbd = 0.001#.0001#10**hp.Float('L1 lambda power', min_value=-1, max_value=3)\n",
    "    \n",
    "    # Define input layers\n",
    "    input_x = keras.layers.Input(shape=(dim_x,), name='x')\n",
    "    input_f = keras.layers.Input(shape=(dim_f,), name='f')\n",
    "    \n",
    "    x = input_x\n",
    "    f = input_f\n",
    "\n",
    "    # Network for f\n",
    "    if num_layers128 > 0:\n",
    "        for i in range(num_layers128):\n",
    "            f = keras.layers.Dense(\n",
    "                    units=128,\n",
    "                    kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                    bias_initializer=keras.initializers.Zeros(), activation='relu')(f)\n",
    "            f = keras.layers.Dropout(0.1)(f)\n",
    "    if num_layers64 > 0:\n",
    "        for i in range(num_layers64):\n",
    "            f = keras.layers.Dense(\n",
    "                    units=64,\n",
    "                    kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                    bias_initializer=keras.initializers.Zeros(), activation='relu')(f)\n",
    "            f = keras.layers.Dropout(0.1)(f)\n",
    "    if num_layers32 > 0:\n",
    "        for i in range(num_layers32):\n",
    "            f = keras.layers.Dense(\n",
    "                    units=32,\n",
    "                    kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                    bias_initializer=keras.initializers.Zeros(), activation='relu')(f)\n",
    "            f = keras.layers.Dropout(0.1)(f)\n",
    "    if num_layers16 > 0:\n",
    "        for i in range(num_layers16):\n",
    "            f = keras.layers.Dense(\n",
    "                    units=16,\n",
    "                    kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                    bias_initializer=keras.initializers.Zeros(), activation='relu')(f)\n",
    "            f = keras.layers.Dropout(0.1)(f)\n",
    "    g1 = keras.layers.Dense(units=1, name='g1', kernel_initializer=keras.initializers.glorot_uniform(seed=seed), bias_initializer=keras.initializers.Zeros())(f)\n",
    "    g2 = keras.layers.Dense(units=1, activation='sigmoid',name='g2', kernel_initializer=keras.initializers.glorot_uniform(seed=seed), bias_initializer=keras.initializers.Zeros())(f)\n",
    "\n",
    "    # Network for x\n",
    "    g3 = keras.layers.Dense(units=1, name='g3', kernel_regularizer=regularizers.l1(lmbd), kernel_initializer=keras.initializers.glorot_uniform(seed=seed), bias_initializer=keras.initializers.Zeros())(input_x)\n",
    "    # g3 = keras.layers.Dense(units=1, name='g3', kernel_initializer=keras.initializers.glorot_uniform(seed=seed), bias_initializer=keras.initializers.Zeros())(input_x)\n",
    "\n",
    "    # Merge the networks\n",
    "    output = keras.layers.Add(name='output')([\n",
    "        g1,\n",
    "        keras.layers.Multiply(name='g2_g3')([g2, g3])\n",
    "    ])\n",
    "\n",
    "    # Define the model\n",
    "    model = keras.models.Model(inputs=[input_x, input_f], outputs=output)\n",
    "\n",
    "    model.compile(\n",
    "        # optimizer=keras.optimizers.Adam(0.01),#grad(learning_rate=0.01, epsilon=1e-7),\n",
    "        optimizer=keras.optimizers.Adagrad(learning_rate=0.01, epsilon=1e-7),\n",
    "        loss='mse',\n",
    "        metrics=['mae', 'mse']\n",
    "    )\n",
    "    \n",
    "    return model\n",
    "\n",
    "def build_model_cls_Affine(hp):\n",
    "    fix_seed(seed)\n",
    "    num_layers128 = hp.Int('num_layers128', 0, 1)\n",
    "    num_layers64 = hp.Int('num_layers64', 0, 1)\n",
    "    num_layers32 = hp.Int('num_layers32', 0, 2)\n",
    "    num_layers16 = hp.Int('num_layers16', 0, 2)\n",
    "    lmbd = 0.001#.0001#10**hp.Float('L1 lambda power', min_value=-1, max_value=3)\n",
    "    \n",
    "    # Define input layers\n",
    "    input_x = keras.layers.Input(shape=(dim_x,), name='x')\n",
    "    input_f = keras.layers.Input(shape=(dim_f,), name='f')\n",
    "    \n",
    "    x = input_x\n",
    "    f = input_f\n",
    "\n",
    "    # Network for f\n",
    "    if num_layers128 > 0:\n",
    "        for i in range(num_layers128):\n",
    "            f = keras.layers.Dense(\n",
    "                    units=128,\n",
    "                    kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                    bias_initializer=keras.initializers.Zeros(), activation='relu')(f)\n",
    "            f = keras.layers.Dropout(0.1)(f)\n",
    "    if num_layers64 > 0:\n",
    "        for i in range(num_layers64):\n",
    "            f = keras.layers.Dense(\n",
    "                    units=64,\n",
    "                    kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                    bias_initializer=keras.initializers.Zeros(), activation='relu')(f)\n",
    "            f = keras.layers.Dropout(0.1)(f)\n",
    "    if num_layers32 > 0:\n",
    "        for i in range(num_layers32):\n",
    "            f = keras.layers.Dense(\n",
    "                    units=32,\n",
    "                    kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                    bias_initializer=keras.initializers.Zeros(), activation='relu')(f)\n",
    "            f = keras.layers.Dropout(0.1)(f)\n",
    "    if num_layers16 > 0:\n",
    "        for i in range(num_layers16):\n",
    "            f = keras.layers.Dense(\n",
    "                    units=16,\n",
    "                    kernel_initializer=keras.initializers.glorot_uniform(seed=seed),\n",
    "                    bias_initializer=keras.initializers.Zeros(), activation='relu')(f)\n",
    "            f = keras.layers.Dropout(0.1)(f)\n",
    "    g1 = keras.layers.Dense(units=1, name='g1', kernel_initializer=keras.initializers.glorot_uniform(seed=seed), bias_initializer=keras.initializers.Zeros())(f)\n",
    "    g2 = keras.layers.Dense(units=1, name='g2', kernel_initializer=keras.initializers.glorot_uniform(seed=seed), bias_initializer=keras.initializers.Zeros())(f)\n",
    "\n",
    "    # Network for x\n",
    "    g3 = keras.layers.Dense(units=1, name='g3', kernel_regularizer=regularizers.l1(lmbd), kernel_initializer=keras.initializers.glorot_uniform(seed=seed), bias_initializer=keras.initializers.Zeros())(input_x)\n",
    "    # g3 = keras.layers.Dense(units=1, name='g3', kernel_initializer=keras.initializers.glorot_uniform(seed=seed), bias_initializer=keras.initializers.Zeros())(input_x)\n",
    "\n",
    "    # Merge the networks\n",
    "    output = keras.layers.Add(name='output')([\n",
    "        g1,\n",
    "        keras.layers.Multiply(name='g2_g3')([g2, g3])\n",
    "    ])\n",
    "    output = keras.layers.Activation('sigmoid', name='sigmoid')(output)\n",
    "\n",
    "    # Define the model\n",
    "    model = keras.models.Model(inputs=[input_x, input_f], outputs=output)\n",
    "\n",
    "    model.compile(\n",
    "        # optimizer=keras.optimizers.Adam(0.01),#grad(learning_rate=0.01, epsilon=1e-7),\n",
    "        optimizer=keras.optimizers.Adagrad(learning_rate=0.01, epsilon=1e-7),\n",
    "        loss='binary_crossentropy',\n",
    "        metrics=['accuracy']\n",
    "    )\n",
    "    \n",
    "    return model\n",
    "\n",
    "def map_to_RA(x):\n",
    "    if x >= 0.5:\n",
    "        return 1\n",
    "    else:\n",
    "        return 0\n",
    "    \n",
    "def fun_Tuning(f, score, decision, ProjectName='tmp'):\n",
    "    f_tr = f\n",
    "    s_tr = score\n",
    "    d_tr = decision\n",
    "    \n",
    "    ## Regression\n",
    "    tuner_reg = BayesianOptimization(\n",
    "        build_model_reg,\n",
    "        objective='val_loss',\n",
    "        project_name=ProjectName+'_reg',\n",
    "        # max_trials = 10,\n",
    "        alpha=0.0001,\n",
    "        beta=2.6,\n",
    "        overwrite=True\n",
    "    )\n",
    "\n",
    "    fix_seed(seed)\n",
    "    tuner_reg.search(\n",
    "        f_tr.values,\n",
    "        s_tr.values,\n",
    "        batch_size=32,\n",
    "        epochs=1000,\n",
    "        validation_split=0.1,\n",
    "        verbose=0,\n",
    "        callbacks=[early_stop]\n",
    "    )\n",
    "\n",
    "    ## Classification\n",
    "    tuner_cls = BayesianOptimization(\n",
    "        build_model_cls,\n",
    "        objective='val_loss',\n",
    "        project_name=ProjectName+'_cls',\n",
    "        # max_trials = 10,\n",
    "        alpha=0.0001,\n",
    "        beta=2.6,\n",
    "        overwrite=True\n",
    "    )\n",
    "\n",
    "    fix_seed(seed)\n",
    "    tuner_cls.search(\n",
    "        f_tr.values,\n",
    "        d_tr.values,\n",
    "        batch_size=32,\n",
    "        epochs=1000,\n",
    "        validation_split=0.1,\n",
    "        verbose=0,\n",
    "        callbacks=[early_stop]\n",
    "    )\n",
    "\n",
    "    return tuner_reg, tuner_cls\n",
    "\n",
    "def fun_TuningAffine(x, f, score, decision, ProjectName='tmp'):\n",
    "    x_tr = x\n",
    "    f_tr = f\n",
    "    s_tr = score\n",
    "    d_tr = decision\n",
    "    \n",
    "    ## Regression\n",
    "    tuner_reg = BayesianOptimization(\n",
    "        build_model_reg_Affine,\n",
    "        objective='val_loss',\n",
    "        project_name=ProjectName+'_reg',\n",
    "        # max_trials = 10,\n",
    "        alpha=0.0001,\n",
    "        beta=2.6,\n",
    "        overwrite=True\n",
    "    )\n",
    "\n",
    "    fix_seed(seed)\n",
    "    tuner_reg.search(\n",
    "        [x_tr.values, f_tr.values],\n",
    "        s_tr.values,\n",
    "        batch_size=32,\n",
    "        epochs=1000,\n",
    "        validation_split=0.1,\n",
    "        verbose=0,\n",
    "        callbacks=[early_stop]\n",
    "    )\n",
    "\n",
    "    ## Classification\n",
    "    tuner_cls = BayesianOptimization(\n",
    "        build_model_cls_Affine,\n",
    "        objective='val_loss',\n",
    "        project_name=ProjectName+'_cls',\n",
    "        # max_trials = 10,\n",
    "        alpha=0.0001,\n",
    "        beta=2.6,\n",
    "        overwrite=True\n",
    "    )\n",
    "\n",
    "    fix_seed(seed)\n",
    "    tuner_cls.search(\n",
    "        [x_tr.values, f_tr.values],\n",
    "        d_tr.values,\n",
    "        batch_size=32,\n",
    "        epochs=1000,\n",
    "        validation_split=0.1,\n",
    "        verbose=0,\n",
    "        callbacks=[early_stop]\n",
    "    )\n",
    "\n",
    "    return tuner_reg, tuner_cls\n",
    "\n",
    "def fun_Training(f, score, decision, tuner_reg, tuner_cls, ScalParams):\n",
    "    f_tr = f[0]\n",
    "    f_te = f[1]\n",
    "    s_tr = score#[0]\n",
    "    # s_te = score[1]\n",
    "    d_tr = decision#[0]\n",
    "    # d_te = decision[1]\n",
    "    \n",
    "    ## Regression\n",
    "    best_hps = tuner_reg.get_best_hyperparameters(num_trials=1)[0]\n",
    "    model_reg = tuner_reg.hypermodel.build(best_hps)\n",
    "\n",
    "    fix_seed(seed)\n",
    "    history_reg = model_reg.fit(\n",
    "        f_tr,\n",
    "        s_tr,\n",
    "        batch_size=32,\n",
    "        epochs=1000,\n",
    "        validation_split=0.1,\n",
    "        verbose=0,\n",
    "        callbacks=[early_stop, PrintDot()])\n",
    "\n",
    "    fits = model_reg.predict(f_tr).reshape(-1)\n",
    "    pred = model_reg.predict(f_te).reshape(-1)\n",
    "    s_fits = pd.Series(fun_invNormScale(fits, ScalParams), f_tr.index)\n",
    "    s_pred = pd.Series(fun_invNormScale(pred, ScalParams), f_te.index)\n",
    "    \n",
    "    ## Classification\n",
    "    best_hps = tuner_cls.get_best_hyperparameters(num_trials=1)[0]\n",
    "    model_cls = tuner_cls.hypermodel.build(best_hps)\n",
    "\n",
    "    fix_seed(seed)\n",
    "    history_reg = model_cls.fit(\n",
    "        f_tr,\n",
    "        d_tr,\n",
    "        batch_size=32,\n",
    "        epochs=1000,\n",
    "        validation_split=0.1,\n",
    "        verbose=0,\n",
    "        callbacks=[early_stop, PrintDot()])\n",
    "\n",
    "    d_fits = model_cls.predict(f_tr).reshape(-1)\n",
    "    d_pred = model_cls.predict(f_te).reshape(-1)\n",
    "    ra_fits = np.array([map_to_RA(x) for x in d_fits])\n",
    "    ra_pred = np.array([map_to_RA(x) for x in d_pred])\n",
    "    \n",
    "    d_fits = pd.Series(d_fits, index=f_tr.index)\n",
    "    d_pred = pd.Series(d_pred, index=f_te.index)\n",
    "    ra_fits = pd.Series(ra_fits, index=f_tr.index)\n",
    "    ra_pred = pd.Series(ra_pred, index=f_te.index)\n",
    "\n",
    "    return model_reg, model_cls, {\n",
    "        'Score fits' : s_fits,\n",
    "        'Score pred' : s_pred,\n",
    "        'DecisionP fits' : d_fits,\n",
    "        'DecisionP pred' : d_pred,\n",
    "        'Decision fits' : ra_fits,\n",
    "        'Decision pred' : ra_pred\n",
    "    }\n",
    "\n",
    "def fun_TrainingAffine(x, f, score, decision, tuner_reg, tuner_cls, ScalParams):\n",
    "    x_tr = x[0]\n",
    "    x_te = x[1]\n",
    "    f_tr = f[0]\n",
    "    f_te = f[1]\n",
    "    s_tr = score#[0]\n",
    "    # s_te = score[1]\n",
    "    d_tr = decision#[0]\n",
    "    # d_te = decision[1]\n",
    "    \n",
    "    ## Regression\n",
    "    best_hps = tuner_reg.get_best_hyperparameters(num_trials=1)[0]\n",
    "    model_reg = tuner_reg.hypermodel.build(best_hps)\n",
    "\n",
    "    fix_seed(seed)\n",
    "    history_reg = model_reg.fit(\n",
    "        [x_tr, f_tr],\n",
    "        s_tr,\n",
    "        batch_size=32,\n",
    "        epochs=1000,\n",
    "        validation_split=0.1,\n",
    "        verbose=0,\n",
    "        callbacks=[early_stop, PrintDot()])\n",
    "\n",
    "    fits = model_reg.predict([x_tr, f_tr]).reshape(-1)\n",
    "    pred = model_reg.predict([x_te, f_te]).reshape(-1)\n",
    "    s_fits = pd.Series(fun_invNormScale(fits, ScalParams), f_tr.index)\n",
    "    s_pred = pd.Series(fun_invNormScale(pred, ScalParams), f_te.index)\n",
    "    \n",
    "    ## Classification\n",
    "    best_hps = tuner_cls.get_best_hyperparameters(num_trials=1)[0]\n",
    "    model_cls = tuner_cls.hypermodel.build(best_hps)\n",
    "\n",
    "    fix_seed(seed)\n",
    "    history_reg = model_cls.fit(\n",
    "        [x_tr, f_tr],\n",
    "        d_tr,\n",
    "        batch_size=32,\n",
    "        epochs=1000,\n",
    "        validation_split=0.1,\n",
    "        verbose=0,\n",
    "        callbacks=[early_stop, PrintDot()])\n",
    "\n",
    "    d_fits = model_cls.predict([x_tr, f_tr]).reshape(-1)\n",
    "    d_pred = model_cls.predict([x_te, f_te]).reshape(-1)\n",
    "    ra_fits = np.array([map_to_RA(x) for x in d_fits])\n",
    "    ra_pred = np.array([map_to_RA(x) for x in d_pred])\n",
    "    \n",
    "    d_fits = pd.Series(d_fits, index=f_tr.index)\n",
    "    d_pred = pd.Series(d_pred, index=f_te.index)\n",
    "    ra_fits = pd.Series(ra_fits, index=f_tr.index)\n",
    "    ra_pred = pd.Series(ra_pred, index=f_te.index)\n",
    "\n",
    "    return model_reg, model_cls, {\n",
    "        'Score fits' : s_fits,\n",
    "        'Score pred' : s_pred,\n",
    "        'DecisionP fits' : d_fits,\n",
    "        'DecisionP pred' : d_pred,\n",
    "        'Decision fits' : ra_fits,\n",
    "        'Decision pred' : ra_pred\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "894c93a0-8b4d-4047-9609-d3f768df1255",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fun_MainFE(df_f, df_y, df_l, n_try, ProjectName):\n",
    "    df_f_tr = df_f[0]\n",
    "    df_f_te = df_f[1]\n",
    "    df_y_tr = df_y[0]\n",
    "    df_y_te = df_y[1]\n",
    "    df_l_tr = df_l[0]\n",
    "    df_l_te = df_l[1]\n",
    "    \n",
    "    fix_seed(seed)\n",
    "    split_seed = np.random.randint(low=1, high=n_try**2, size=n_try)\n",
    "    text_id = df_y_tr.index\n",
    "    \n",
    "    rmse_list = []\n",
    "    acc_list = []\n",
    "    for i_try in range(n_try):\n",
    "        print(ProjectName+' : '+str(i_try))\n",
    "        ## Split\n",
    "        fix_seed(seed)\n",
    "        train_id, test_id = train_test_split(text_id, test_size=0.1, random_state=split_seed[i_try])\n",
    "\n",
    "        score_train = df_y_tr.loc[train_id]\n",
    "        score_test = df_y_te\n",
    "        decision_train = df_l_tr.loc[train_id]\n",
    "        decision_test = df_l_te\n",
    "        f_train = df_f_tr.loc[train_id]\n",
    "        f_test = df_f_te\n",
    "\n",
    "        score_train_scal, ScalParams = fun_NormScale(score_train)\n",
    "        score_test_scal, _ = fun_NormScale(score_test, params=ScalParams)\n",
    "\n",
    "        try:\n",
    "            tuner_reg\n",
    "            tuner_cls\n",
    "        except NameError:\n",
    "            tuner_reg, tuner_cls = fun_Tuning(f=f_train, score=score_train_scal, decision=decision_train, ProjectName=ProjectName)\n",
    "\n",
    "        model_reg, model_cls, logits = fun_Training(f=[f_train, f_test], score=score_train_scal, decision=decision_train, tuner_reg=tuner_reg, tuner_cls=tuner_cls, ScalParams=ScalParams)\n",
    "\n",
    "        rmse_list.append(mean_squared_error(score_test, logits['Score pred'], squared=False))\n",
    "        acc_list.append(accuracy_score(decision_test, logits['Decision pred']))\n",
    "\n",
    "        json_string = model_reg.to_json()\n",
    "        open(out_dir+ProjectName+'/FEModelReg'+str(i_try)+'.json', 'w').write(json_string)\n",
    "        model_reg.save_weights(out_dir+ProjectName+'/FEModelReg'+str(i_try)+'.hdf5')\n",
    "        \n",
    "        json_string = model_cls.to_json()\n",
    "        open(out_dir+ProjectName+'/FEModelCls'+str(i_try)+'.json', 'w').write(json_string)\n",
    "        model_cls.save_weights(out_dir+ProjectName+'/FEModelCls'+str(i_try)+'.hdf5')\n",
    "\n",
    "        clear_output(True)\n",
    "        \n",
    "        del model_reg, model_cls\n",
    "    \n",
    "    del tuner_reg, tuner_cls\n",
    "    return rmse_list, acc_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb57dbf4-2821-4c81-acad-b01cf48ea4ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fun_MainAffine(df_x, df_f, df_y, df_l, n_try, ProjectName):\n",
    "    df_x_tr = df_x[0]\n",
    "    df_x_te = df_x[1]\n",
    "    df_f_tr = df_f[0]\n",
    "    df_f_te = df_f[1]\n",
    "    df_y_tr = df_y[0]\n",
    "    df_y_te = df_y[1]\n",
    "    df_l_tr = df_l[0]\n",
    "    df_l_te = df_l[1]\n",
    "    \n",
    "    fix_seed(seed)\n",
    "    split_seed = np.random.randint(low=1, high=n_try**2, size=n_try)\n",
    "    text_id = df_y_tr.index\n",
    "    \n",
    "    rmse_list = []\n",
    "    acc_list = []\n",
    "    for i_try in range(n_try):\n",
    "        print(ProjectName+' : '+str(i_try))\n",
    "        ## Split\n",
    "        fix_seed(seed)\n",
    "        train_id, test_id = train_test_split(text_id, test_size=0.1, random_state=split_seed[i_try])\n",
    "\n",
    "        score_train = df_y_tr.loc[train_id]\n",
    "        score_test = df_y_te\n",
    "        decision_train = df_l_tr.loc[train_id]\n",
    "        decision_test = df_l_te\n",
    "        f_train = df_f_tr.loc[train_id]\n",
    "        f_test = df_f_te\n",
    "        x_train = df_x_tr.loc[train_id]\n",
    "        x_test = df_x_te\n",
    "\n",
    "        score_train_scal, ScalParams = fun_NormScale(score_train)\n",
    "        score_test_scal, _ = fun_NormScale(score_test, params=ScalParams)\n",
    "\n",
    "        try:\n",
    "            tuner_reg\n",
    "            tuner_cls\n",
    "        except NameError:\n",
    "            tuner_reg, tuner_cls = fun_TuningAffine(x=x_train, f=f_train, score=score_train_scal, decision=decision_train, ProjectName=ProjectName)\n",
    "\n",
    "        model_reg, model_cls, logits = fun_TrainingAffine(x=[x_train, x_test], f=[f_train, f_test], score=score_train_scal, decision=decision_train, tuner_reg=tuner_reg, tuner_cls=tuner_cls, ScalParams=ScalParams)\n",
    "\n",
    "        rmse_list.append(mean_squared_error(score_test, logits['Score pred'], squared=False))\n",
    "        acc_list.append(accuracy_score(decision_test, logits['Decision pred']))\n",
    "        \n",
    "        json_string = model_reg.to_json()\n",
    "        open(out_dir+ProjectName+'/AffineModelReg'+str(i_try)+'.json', 'w').write(json_string)\n",
    "        model_reg.save_weights(out_dir+ProjectName+'/AffineModelReg'+str(i_try)+'.hdf5')\n",
    "        \n",
    "        json_string = model_cls.to_json()\n",
    "        open(out_dir+ProjectName+'/AffineModelCls'+str(i_try)+'.json', 'w').write(json_string)\n",
    "        model_cls.save_weights(out_dir+ProjectName+'/AffineModelCls'+str(i_try)+'.hdf5')\n",
    "\n",
    "        clear_output(True)\n",
    "        \n",
    "        del model_reg, model_cls\n",
    "    \n",
    "    del tuner_reg, tuner_cls\n",
    "    return rmse_list, acc_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6e12dbe-0770-4d09-b2cd-8deb5079046a",
   "metadata": {},
   "outputs": [],
   "source": [
    "ProjectNames = ['BERT', 'SciBERT', 'GPT-3', 'T5']\n",
    "n_try = 100\n",
    "\n",
    "for ProjectName in ProjectNames:\n",
    "    fix_seed(seed)\n",
    "    # BERT\n",
    "    if ProjectName == 'BERT':\n",
    "        df_enc = pd.read_csv('../10_Data/df_enc_BERT.csv', index_col=0)\n",
    "    # SciBERT\n",
    "    elif ProjectName == 'SciBERT':\n",
    "        df_enc = pd.read_csv('../10_Data/df_enc_SciBERT.csv', index_col=0)\n",
    "    # GPT-3\n",
    "    elif ProjectName == 'GPT-3':\n",
    "        df_enc = pd.read_csv('../10_Data/df_enc_GPT-3.csv', index_col=0)\n",
    "    # T5\n",
    "    elif ProjectName == 'T5':\n",
    "        df_enc = pd.read_csv('../10_Data/df_enc_T5.csv', index_col=0)\n",
    "\n",
    "    # df = pd.read_csv('../10_Data/df.csv', index_col=1)\n",
    "    df = pd.read_csv('../10_Data/df_removeURL.csv', index_col=1)\n",
    "    csv_train = pd.read_csv('../10_Data/df_train.csv', index_col=1)\n",
    "    csv_test = pd.read_csv('../10_Data/df_test.csv', index_col=1)\n",
    "    train_id = csv_train.index\n",
    "    test_id = csv_test.index\n",
    "\n",
    "    ## Encoding\n",
    "    text = df['abstract (wo URL)']\n",
    "    vectorizer = CountVectorizer(max_df=0.9, min_df=0.01, stop_words='english', ngram_range=(2,2))\n",
    "    vector = vectorizer.fit_transform(text)\n",
    "    df_bow = pd.DataFrame(vector.toarray(), index=df.index, columns=vectorizer.get_feature_names_out())\n",
    "    df_bow = df_bow.iloc[:,9:]\n",
    "\n",
    "    ## Pre-processing\n",
    "    label_map = {'R': 0, 'A': 1}\n",
    "    binary_labels = pd.Series(np.array([label_map[label] for label in df['decision']]), index=df.index)\n",
    "\n",
    "    dim_x = df_bow.shape[1]\n",
    "    dim_f = df_enc.shape[1]\n",
    "\n",
    "    ## Extract common indices\n",
    "    common_indices_train = train_id.intersection(df_bow.index).intersection(df_enc.index).intersection(df.index)\n",
    "    common_indices_test = test_id.intersection(df_bow.index).intersection(df_enc.index).intersection(df.index)\n",
    "\n",
    "    df_bow_train = df_bow.loc[common_indices_train]\n",
    "    df_bow_test = df_bow.loc[common_indices_test]\n",
    "    df_enc_train = df_enc.loc[common_indices_train]\n",
    "    df_enc_test = df_enc.loc[common_indices_test]\n",
    "    df_train = df.loc[common_indices_train]\n",
    "    df_test = df.loc[common_indices_test]\n",
    "    binary_labels_train = binary_labels.loc[common_indices_train]\n",
    "    binary_labels_test = binary_labels.loc[common_indices_test]\n",
    "\n",
    "    if not os.path.isdir(out_dir+ProjectName):\n",
    "        os.makedirs(out_dir+ProjectName)\n",
    "\n",
    "    rmse_list_FE, acc_list_FE = fun_MainFE(\n",
    "        df_f = [df_enc_train, df_enc_test],\n",
    "        df_y = [df_train['mean_rating'], df_test['mean_rating']],\n",
    "        df_l = [binary_labels_train, binary_labels_test],\n",
    "        n_try = n_try,\n",
    "        ProjectName = ProjectName\n",
    "    )\n",
    "\n",
    "    rmse_list_Affine, acc_list_Affine = fun_MainAffine(\n",
    "        df_x = [df_bow_train, df_bow_test],\n",
    "        df_f = [df_enc_train, df_enc_test],\n",
    "        df_y = [df_train['mean_rating'], df_test['mean_rating']],\n",
    "        df_l = [binary_labels_train, binary_labels_test],\n",
    "        n_try = n_try,\n",
    "        ProjectName = ProjectName\n",
    "    )\n",
    "\n",
    "    results = pd.DataFrame({\n",
    "        'RMSE ('+ ProjectName+'-FE)' : rmse_list_FE,\n",
    "        'Accuracy ('+ ProjectName+'-FE)' : acc_list_FE,\n",
    "        'RMSE ('+ ProjectName+'-Affine)' : rmse_list_Affine,\n",
    "        'Accuracy ('+ ProjectName+'-Affine)' : acc_list_Affine\n",
    "    })\n",
    "    results.to_csv(out_dir+'Results-'+ProjectName+'.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e74b47af-9634-4ed1-8ad5-0949433a3e8a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
