{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "791feaec",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "executionInfo": {
     "elapsed": 367,
     "status": "ok",
     "timestamp": 1725004556513,
     "user": {
      "displayName": "",
      "userId": "02254144524654019702"
     },
     "user_tz": -120
    },
    "id": "791feaec",
    "outputId": "3aa0edbf-30a6-48c1-f7c9-ea80235136d7"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "type": "string"
      },
      "text/plain": [
       "'3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0]'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "sys.version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55c30a4d",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 27028,
     "status": "ok",
     "timestamp": 1725004583834,
     "user": {
      "displayName": "",
      "userId": "02254144524654019702"
     },
     "user_tz": -120
    },
    "id": "55c30a4d",
    "outputId": "8d66e90a-fe6d-4495-e816-2f1d741969fe"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mounted at /content/drive\n",
      "Pytorch version: 2.4.0+cu121\n",
      "Device name: Tesla T4\n"
     ]
    }
   ],
   "source": [
    "from google.colab import drive\n",
    "import torch\n",
    "\n",
    "drive.mount('/content/drive', force_remount=True)\n",
    "path = \"/content/drive/My Drive/EC-GitHub/\"\n",
    "os.chdir(path)\n",
    "\n",
    "if torch.cuda.is_available():\n",
    "    device = torch.device(\"cuda\")\n",
    "    print(f\"Pytorch version: {torch.__version__}\")\n",
    "    print(f\"Device name: {torch.cuda.get_device_name(0)}\")\n",
    "else:\n",
    "    device = torch.device(\"cpu\")\n",
    "    print(\"No GPU available.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69c873e7",
   "metadata": {
    "id": "69c873e7"
   },
   "outputs": [],
   "source": [
    "# import libraries\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler\n",
    "import torch.optim as optim\n",
    "import torch.nn.functional as F\n",
    "from transformers import AdamW, DistilBertTokenizerFast, BertTokenizerFast, AlbertTokenizerFast, DistilBertModel, AlbertModel, BertModel\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "import time\n",
    "\n",
    "# useful .py\n",
    "from settings import * # settings\n",
    "from dataset import * # data pre-processing\n",
    "from model import * # models\n",
    "from optimization import * # model training, evaluation\n",
    "\n",
    "import warnings\n",
    "warnings.simplefilter('ignore')\n",
    "pd.set_option('display.max_columns', 500)\n",
    "pd.set_option('display.width', 100)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "rLK1Q_Dsxurp",
   "metadata": {
    "id": "rLK1Q_Dsxurp"
   },
   "source": [
    "### Fine-tuning pretrained models (AllTextBERT and LateFuseBERT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "EyiDjRYuRarP",
   "metadata": {
    "executionInfo": {
     "elapsed": 429,
     "status": "ok",
     "timestamp": 1726133025927,
     "user": {
      "displayName": "",
      "userId": "02254144524654019702"
     },
     "user_tz": -120
    },
    "id": "EyiDjRYuRarP"
   },
   "outputs": [],
   "source": [
    "# select DATASET\n",
    "DATASET = \"wine_100\" # choose in {\"cloth_4\", \"airbnb\",  \"kick\", \"pet_4\", \"salary_5, \"wine_10\", \"wine_100\"}\n",
    "FILENAME, categorical_var, numerical_var, text_var, MAX_LEN_QUANTILE, N_CLASSES, WEIGHT_DECAY, FACTOR, N_EPOCHS, split_val, CRITERION, N_SEED, DROPOUT= load_settings(dataset = DATASET)\n",
    "\n",
    "# performance records\n",
    "perf_results = pd.DataFrame()\n",
    "i = 0\n",
    "\n",
    "for MODEL_TYPE in [\"LateFuseBERT\", \"AllTextBERT\"]:\n",
    "\n",
    "    # load and prepare dataset\n",
    "    df_original = preprocess_dataset(DATASET, MODEL_TYPE)\n",
    "\n",
    "    for SEED in range(N_SEED):\n",
    "\n",
    "            start = time.time()\n",
    "\n",
    "            # original dataset\n",
    "            df = df_original.copy()\n",
    "\n",
    "            # GPU or CPU\n",
    "            device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "            # control randomness\n",
    "            random.seed(SEED)\n",
    "            np.random.seed(SEED)\n",
    "            torch.manual_seed(SEED)\n",
    "            torch.cuda.manual_seed(SEED)\n",
    "\n",
    "            # temporary dataframes to compute uncertainty metrics\n",
    "            uncertainty_results = pd.DataFrame()\n",
    "            val_uncertainty_results = pd.DataFrame()\n",
    "\n",
    "            perf_results.loc[i,\"model type\"] = MODEL_TYPE + \"_Bert\"\n",
    "            perf_results.loc[i,\"seed\"] = SEED\n",
    "\n",
    "            # Train/Test split\n",
    "            df, target = train_test_split(df, test_size = split_val, random_state = SEED)\n",
    "\n",
    "            # text cleaning (keep only words, numbers, and spaces)\n",
    "            df['clean_text'] = df[text_var].apply(lambda row:clean_text(row))\n",
    "            target['clean_text'] = target[text_var].apply(lambda row:clean_text(row))\n",
    "\n",
    "            # Load the specific tokenizer\n",
    "            tokenizer = BertTokenizerFast.from_pretrained(\"bert-base-uncased\", do_lower_case=True) # \"distilbert-base-uncased\", \"bert-base-uncased\", \"albert/albert-base-v2\"\n",
    "\n",
    "            # text max length\n",
    "            MAX_LEN = int(np.quantile(df.apply(lambda row : len(tokenizer(row['clean_text']).input_ids), axis=1).values, q = [MAX_LEN_QUANTILE]).item())\n",
    "            MAX_LEN = min(MAX_LEN, 512) # maximum sequence length is 512 for BERT family\n",
    "            perf_results.loc[i,\"max text length\"] = MAX_LEN\n",
    "\n",
    "            # Numerical variables pre-processing\n",
    "            numerical_var_scaled = standardScaling(df, target, numerical_var)\n",
    "            NUM_NUMERICAL_VAR = len(numerical_var)\n",
    "\n",
    "            # Categorical variables pre-processing\n",
    "            categorical_var_oe, CAT_VOCAB_SIZES = ordinalEncoding(df, target, categorical_var)\n",
    "            NUM_CAT_VAR = len(categorical_var)\n",
    "\n",
    "            # train / validation split\n",
    "            df_train, df_validation = train_test_split(df, test_size = split_val, random_state = SEED)\n",
    "            perf_results.loc[i,\"training size\"] = df_train.shape[0]\n",
    "            perf_results.loc[i,\"test size\"] = target.shape[0]\n",
    "\n",
    "            # hyper-parameters\n",
    "            LR, BATCH_SIZE, D_FC, N_EPOCHS, N_HEADS, N_LAYERS = load_pretrained_settings()\n",
    "            perf_results.loc[i,\"LR\"] = LR\n",
    "            perf_results.loc[i,\"BATCH_SIZE\"] = BATCH_SIZE\n",
    "            perf_results.loc[i,\"N_HEADS\"] = N_HEADS\n",
    "            perf_results.loc[i,\"N_LAYERS\"] = N_LAYERS\n",
    "\n",
    "            # prepare the Tensor Datasets, including tokenization\n",
    "            dataset_train = prepareTensorDatasetWithTokenizer(df_train, \"clean_text\", categorical_var_oe, numerical_var_scaled, 'Y', tokenizer, MAX_LEN, special_tokens=True)\n",
    "            dataset_validation = prepareTensorDatasetWithTokenizer(df_validation, \"clean_text\", categorical_var_oe, numerical_var_scaled, 'Y', tokenizer, MAX_LEN, special_tokens=True)\n",
    "            dataset_target = prepareTensorDatasetWithTokenizer(target, \"clean_text\", categorical_var_oe, numerical_var_scaled, 'Y', tokenizer, MAX_LEN, special_tokens=True)\n",
    "\n",
    "            # data loaders\n",
    "            loader_train = DataLoader(dataset_train, sampler = RandomSampler(dataset_train), batch_size = BATCH_SIZE)\n",
    "            loader_validation = DataLoader(dataset_validation, sampler = SequentialSampler(dataset_validation),batch_size = BATCH_SIZE)\n",
    "            loader_target = DataLoader(dataset_target, sampler = SequentialSampler(dataset_target),batch_size = BATCH_SIZE)\n",
    "\n",
    "            # Load Bert with a linear classification layer\n",
    "            BERT_model = BertModel.from_pretrained(\"bert-base-uncased\").to(device) # \"distilbert-base-uncased\", \"bert-base-uncased\", \"albert/albert-base-v2\"\n",
    "\n",
    "            # model initialization\n",
    "            torch.manual_seed(SEED)\n",
    "            model = init_model(model_type = MODEL_TYPE,\n",
    "                               d_model = BERT_model.embeddings.word_embeddings.embedding_dim, # dimension = 768 for BERT family\n",
    "                               cat_vocab_sizes = CAT_VOCAB_SIZES,\n",
    "                               num_cat_var = NUM_CAT_VAR,\n",
    "                               num_numerical_var = NUM_NUMERICAL_VAR,\n",
    "                               n_heads = N_HEADS,\n",
    "                               n_layers = N_LAYERS,\n",
    "                               dropout = DROPOUT,\n",
    "                               d_fc = D_FC,\n",
    "                               n_classes = N_CLASSES,\n",
    "                               seed = SEED,\n",
    "                               text_model = BERT_model).to(device)\n",
    "\n",
    "            # number of trainable parameters\n",
    "            perf_results.loc[i,\"trainable parameters\"] = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
    "\n",
    "            # optimizer\n",
    "            optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)\n",
    "\n",
    "            # training\n",
    "            model, epochs = training_pretrained(model, MODEL_TYPE, loader_train,  N_EPOCHS, loader_validation, CRITERION, optimizer, FACTOR, SEED, verbose=True, device = device)\n",
    "            perf_results.loc[i,\"epochs\"] = epochs\n",
    "            # save fine-tuned model\n",
    "            torch.save(model, 'trained_models/'+MODEL_TYPE+'_Bert_'+DATASET+'_'+str(SEED)+'_checkpoint.pt')\n",
    "\n",
    "            # model evaluation\n",
    "            model.eval()\n",
    "\n",
    "            target_perf = performance_pretrained(model, loader_target, MODEL_TYPE, SEED, device)\n",
    "            perf_results.loc[i,\"performance (Target)\"] = target_perf\n",
    "\n",
    "            elapsed_time = time.time()-start\n",
    "            perf_results.loc[i,\"time\"] = elapsed_time\n",
    "\n",
    "            i+=1\n",
    "\n",
    "    display(perf_results)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "RFIVr_AMvrfT",
   "metadata": {
    "id": "RFIVr_AMvrfT"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
