{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import CatBoostClassifier, Pool\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random\n",
    "import re\n",
    "import psutil\n",
    "import gc\n",
    "import time\n",
    "import pickle\n",
    "import pynvml\n",
    "\n",
    "# Seed the global random number generators of Python's `random` module and NumPy.\n",
    "random.seed(42)\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to clean folder names\n",
    "def clean_folder_name(folder_name):\n",
    "    \"\"\"Sanitise ``folder_name`` by deleting a fixed set of characters.\n",
    "\n",
    "    Every ``<``, ``>``, ``:``, double quote, ``/``, backslash, ``|``, ``?``\n",
    "    and ``*`` is removed; trailing dots and spaces are then trimmed from\n",
    "    what is left. The cleaned string is returned.\n",
    "    \"\"\"\n",
    "    without_forbidden = re.sub(r'[<>:\"/\\\\|?*]', '', folder_name)\n",
    "    return without_forbidden.rstrip('. ')\n",
    "\n",
    "\n",
    "def CPU_monitor_memory_usage(high_threshold=95, low_threshold=30, poll_interval=10, max_wait=None):\n",
    "    \"\"\"Print system RAM usage and pause while it is critically high.\n",
    "\n",
    "    The percentage reported by ``psutil.virtual_memory()`` is printed. If it\n",
    "    is at or above ``high_threshold``, the garbage collector is run and the\n",
    "    function sleeps in ``poll_interval``-second steps until usage is no\n",
    "    longer above ``low_threshold``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    high_threshold : float, default 95\n",
    "        Usage percentage at or above which execution is paused.\n",
    "    low_threshold : float, default 30\n",
    "        Usage percentage that must be reached before execution resumes.\n",
    "    poll_interval : float, default 10\n",
    "        Seconds to sleep between two measurements while paused.\n",
    "    max_wait : float or None, default None\n",
    "        Upper bound in seconds on the total pause. ``None`` waits without\n",
    "        limit; pass a number to avoid blocking forever on a machine whose\n",
    "        usage never falls to ``low_threshold``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "    \"\"\"\n",
    "    memory_usage = psutil.virtual_memory().percent\n",
    "    print(f\"CPU Current memory usage: {memory_usage}%\")\n",
    "\n",
    "    if memory_usage >= high_threshold:\n",
    "        print(\"CPU Memory usage is too high. Pausing execution...\")\n",
    "        gc.collect()  # Trigger garbage collection manually\n",
    "        waited = 0\n",
    "        while memory_usage > low_threshold:\n",
    "            if max_wait is not None and waited >= max_wait:\n",
    "                print(f\"CPU Memory usage still at {memory_usage}% after waiting {waited}s. Continuing anyway...\")\n",
    "                break\n",
    "            time.sleep(poll_interval)\n",
    "            waited += poll_interval\n",
    "            # Collect again on every poll: garbage held by this process is\n",
    "            # the only memory this function can release itself.\n",
    "            gc.collect()\n",
    "            memory_usage = psutil.virtual_memory().percent\n",
    "        else:\n",
    "            # Reached only when the loop ended without hitting max_wait.\n",
    "            print(\"CPU Memory usage is low enough. Resuming execution...\")\n",
    "\n",
    "\n",
    "def monitor_gpu_memory(device_index=0, high_threshold=95, low_threshold=30, poll_interval=10, max_wait=None):\n",
    "    \"\"\"Print GPU memory usage via NVML and pause while it is critically high.\n",
    "\n",
    "    NVML is initialised, the used/total memory ratio of the selected device\n",
    "    is printed as a percentage, and NVML is shut down again before\n",
    "    returning. If the usage is at or above ``high_threshold`` the function\n",
    "    sleeps in ``poll_interval``-second steps until usage is no longer above\n",
    "    ``low_threshold``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    device_index : int, default 0\n",
    "        Index passed to ``pynvml.nvmlDeviceGetHandleByIndex``.\n",
    "    high_threshold : float, default 95\n",
    "        Usage percentage at or above which execution is paused.\n",
    "    low_threshold : float, default 30\n",
    "        Usage percentage that must be reached before execution resumes.\n",
    "    poll_interval : float, default 10\n",
    "        Seconds to sleep between two measurements while paused.\n",
    "    max_wait : float or None, default None\n",
    "        Upper bound in seconds on the total pause. ``None`` waits without\n",
    "        limit; pass a number to avoid blocking forever when another process\n",
    "        keeps the device above ``low_threshold``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "    \"\"\"\n",
    "    # Initialize NVML\n",
    "    pynvml.nvmlInit()\n",
    "\n",
    "    try:\n",
    "        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)\n",
    "\n",
    "        def used_percent():\n",
    "            # Share of the device's total memory currently in use, in percent.\n",
    "            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)\n",
    "            return (mem_info.used / mem_info.total) * 100\n",
    "\n",
    "        memory_usage = used_percent()\n",
    "        print(f\"Current GPU memory usage: {memory_usage:.2f}%\")\n",
    "\n",
    "        if memory_usage >= high_threshold:\n",
    "            print(\"GPU memory usage is too high. Pausing execution...\")\n",
    "            waited = 0\n",
    "            while memory_usage > low_threshold:\n",
    "                if max_wait is not None and waited >= max_wait:\n",
    "                    print(f\"GPU memory usage still at {memory_usage:.2f}% after waiting {waited}s. Continuing anyway...\")\n",
    "                    break\n",
    "                time.sleep(poll_interval)\n",
    "                waited += poll_interval\n",
    "                memory_usage = used_percent()\n",
    "            else:\n",
    "                # Reached only when the loop ended without hitting max_wait.\n",
    "                print(\"GPU memory usage is low enough. Resuming execution...\")\n",
    "\n",
    "    finally:\n",
    "        # Always release NVML, even if one of the queries above raised.\n",
    "        pynvml.nvmlShutdown()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bch_classification_report_to_df(report, y_true, y_pred):\n",
    "    \"\"\"Turn a classification-report dict into a DataFrame with extra metrics.\n",
    "\n",
    "    The report is transposed so that every class and every summary entry is\n",
    "    a row, and two columns are appended: ``Sensitivity`` (a copy of\n",
    "    ``recall``) and ``Specificity`` (TN / (TN + FP), derived from the\n",
    "    confusion matrix of ``y_true`` against ``y_pred``).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    report : dict\n",
    "        Output of ``classification_report(..., output_dict=True)``. Its last\n",
    "        three entries must be 'accuracy', 'macro avg' and 'weighted avg';\n",
    "        everything before them is treated as a class.\n",
    "    y_true, y_pred : array-like\n",
    "        True and predicted labels the report was computed from.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per class plus the three summary rows. In the 'accuracy'\n",
    "        row both new columns hold the accuracy; in the two average rows they\n",
    "        hold the unweighted / support-weighted mean over the classes.\n",
    "    \"\"\"\n",
    "    df = pd.DataFrame(report).transpose()\n",
    "\n",
    "    # Exclude 'accuracy', 'macro avg', 'weighted avg'\n",
    "    class_names = df.index[:-3]\n",
    "\n",
    "    # Report keys are always strings, while the labels themselves may be of\n",
    "    # another type (e.g. int). Map each key back to the original label so\n",
    "    # that confusion_matrix can find it in y_true / y_pred.\n",
    "    label_lookup = {}\n",
    "    for values in (y_true, y_pred):\n",
    "        for label in np.asarray(values).ravel().tolist():\n",
    "            label_lookup.setdefault(str(label), label)\n",
    "    cm_labels = [label_lookup.get(name, name) for name in class_names]\n",
    "    cm = confusion_matrix(y_true, y_pred, labels=cm_labels)\n",
    "\n",
    "    # Sensitivity is the same quantity as recall\n",
    "    df['Sensitivity'] = df['recall']\n",
    "\n",
    "    # Per-class counts: rows of cm are true labels, columns are predictions\n",
    "    tp = np.diag(cm)\n",
    "    fp = cm.sum(axis=0) - tp\n",
    "    fn = cm.sum(axis=1) - tp\n",
    "    tn = cm.sum() - (tp + fp + fn)\n",
    "\n",
    "    # Specificity is undefined (NaN) for a class without any negatives\n",
    "    negatives = tn + fp\n",
    "    specificity = np.divide(tn, negatives, out=np.full(negatives.shape, np.nan), where=negatives > 0)\n",
    "    df.loc[class_names, 'Specificity'] = specificity\n",
    "\n",
    "    # Every column of the 'accuracy' row carries the accuracy value\n",
    "    accuracy = df.loc['accuracy', 'precision']\n",
    "    df.loc['accuracy', ['Sensitivity', 'Specificity']] = accuracy\n",
    "\n",
    "    # Aggregate the per-class values into the two average rows\n",
    "    per_class = df.iloc[:-3]\n",
    "    support = per_class['support']\n",
    "    for column in ('Sensitivity', 'Specificity'):\n",
    "        df.loc['macro avg', column] = per_class[column].mean()\n",
    "        df.loc['weighted avg', column] = np.average(per_class[column], weights=support)\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_bch(X_train_re, X_test_re, Y_train_re, Y_test_re, catboost_params,\n",
    "              output_dir=\"D:/AutoGeTS/Models_and_Performances\",\n",
    "              output_stem=\"Benchmark_M0_Classdf_0\"):\n",
    "    \"\"\"Train the benchmark CatBoost classifier and record its test performance.\n",
    "\n",
    "    The model is fitted on the 'text' and 'area_TEIS' columns, with 'text'\n",
    "    declared as a text feature and 'area_TEIS' as a categorical feature. The\n",
    "    accuracy and the classification report on the test data are printed, and\n",
    "    the extended report table is optionally written to disk.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X_train_re, X_test_re : pandas.DataFrame\n",
    "        Frames that contain at least the columns 'text' and 'area_TEIS'.\n",
    "    Y_train_re, Y_test_re : array-like\n",
    "        Target labels matching the two frames.\n",
    "    catboost_params : dict\n",
    "        Keyword arguments forwarded to ``CatBoostClassifier``.\n",
    "    output_dir : str or None, default \"D:/AutoGeTS/Models_and_Performances\"\n",
    "        Existing directory that receives the report table as ``.pkl`` and\n",
    "        ``.csv``. ``None`` skips writing any file.\n",
    "    output_stem : str, default \"Benchmark_M0_Classdf_0\"\n",
    "        File name, without extension, used for both output files.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Keys 'model' (the fitted classifier), 'classification_df',\n",
    "        'accuracy' and 'retraining_time' (seconds spent in ``fit``).\n",
    "    \"\"\"\n",
    "    CPU_monitor_memory_usage()\n",
    "    monitor_gpu_memory()\n",
    "\n",
    "    feature_columns = [\"text\", \"area_TEIS\"]\n",
    "    pool_options = {\"text_features\": [\"text\"], \"cat_features\": [\"area_TEIS\"]}\n",
    "\n",
    "    train_pool_re = Pool(X_train_re[feature_columns], Y_train_re, **pool_options)\n",
    "    valid_pool_re = Pool(X_test_re[feature_columns], Y_test_re, **pool_options)\n",
    "\n",
    "    # Model Training\n",
    "    # NOTE(review): the test data doubles as eval_set here. If catboost_params\n",
    "    # enables early stopping / use_best_model, the iteration count is chosen on\n",
    "    # the same data the metrics below are reported on - confirm this is intended.\n",
    "    model_re = CatBoostClassifier(**catboost_params)\n",
    "    start_time = time.time()  # Start timing\n",
    "    model_re.fit(train_pool_re, eval_set=valid_pool_re)\n",
    "    training_time = time.time() - start_time  # End timing\n",
    "\n",
    "    # Evaluate on the test data\n",
    "    predictions = model_re.predict(X_test_re[feature_columns])\n",
    "    accuracy = accuracy_score(Y_test_re, predictions)\n",
    "    report = classification_report(Y_test_re, predictions, digits=3, output_dict=True)\n",
    "    print(accuracy)\n",
    "    print(report)\n",
    "    classification_df = bch_classification_report_to_df(report, Y_test_re, predictions)\n",
    "\n",
    "    # Save the performance table unless the caller opted out\n",
    "    if output_dir is not None:\n",
    "        classification_df.to_pickle(f\"{output_dir}/{output_stem}.pkl\")\n",
    "        classification_df.to_csv(f\"{output_dir}/{output_stem}.csv\", index=True)\n",
    "\n",
    "    bch_dict = {\n",
    "        'model': model_re,\n",
    "        'classification_df': classification_df,\n",
    "        'accuracy': accuracy,\n",
    "        'retraining_time': training_time,\n",
    "    }\n",
    "\n",
    "    return bch_dict\n",
    "\n",
    "def create_pca_df(X_transformed, df_original):\n",
    "    \"\"\"Append the columns of ``X_transformed`` to ``df_original``.\n",
    "\n",
    "    The transformed values are wrapped in a frame that carries the index of\n",
    "    ``df_original`` and one ``PCA_<i>`` column per column of\n",
    "    ``X_transformed``. Both frames then get a fresh default index and are\n",
    "    concatenated side by side, original columns first.\n",
    "    \"\"\"\n",
    "    n_components = X_transformed.shape[1]\n",
    "    components = pd.DataFrame(\n",
    "        X_transformed,\n",
    "        columns=[f'PCA_{i}' for i in range(n_components)],\n",
    "        index=df_original.index,\n",
    "    )\n",
    "\n",
    "    # Discard both indices before joining the two frames column-wise.\n",
    "    left = df_original.reset_index(drop=True)\n",
    "    right = components.reset_index(drop=True)\n",
    "\n",
    "    return pd.concat([left[left.columns], right], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    # Read the ticket table, discard incomplete rows and keep each row's\n",
    "    # pre-filter position in an 'index_meta' column.\n",
    "    data = pd.read_csv('D:/AutoGeTS/Data/tickets_topics.csv', lineterminator='\\n')\n",
    "    data_topic = (\n",
    "        data.dropna()\n",
    "        .reset_index()\n",
    "        .rename(columns={'index': 'index_meta'})\n",
    "    )\n",
    "\n",
    "    # Hold out 20% of the rows; the topic name is the target.\n",
    "    X_train_r, X_test_re, Y_train_r, Y_test_re = train_test_split(\n",
    "        data_topic,\n",
    "        data_topic.topic_name,\n",
    "        test_size=0.2,\n",
    "        random_state=42,\n",
    "    )\n",
    "\n",
    "    # Settings for the benchmark model. An alternative that was tried differs\n",
    "    # only in learning_rate=0.5, depth=6 and l2_leaf_reg=10.\n",
    "    catboost_params = dict(\n",
    "        iterations=300,\n",
    "        learning_rate=0.2,\n",
    "        depth=8,\n",
    "        l2_leaf_reg=1,\n",
    "        bagging_temperature=1,\n",
    "        random_strength=1,\n",
    "        border_count=254,\n",
    "        eval_metric='TotalF1',\n",
    "        task_type='GPU',\n",
    "        early_stopping_rounds=20,\n",
    "        use_best_model=True,\n",
    "        verbose=1,\n",
    "        random_seed=0,\n",
    "    )\n",
    "\n",
    "    # Train, then persist the returned dictionary (model and metrics).\n",
    "    bch_dict = train_bch(X_train_r, X_test_re, Y_train_r, Y_test_re, catboost_params)\n",
    "\n",
    "    with open(\"D:/AutoGeTS/Models_and_Performances/Benchmark_M0_dict_0.pkl\", 'wb') as file:\n",
    "        pickle.dump(bch_dict, file)\n",
    "    \n",
    "    # # Extract the text column\n",
    "    # texts = data_topic['text']\n",
    "    # # Initialize the TF-IDF Vectorizer\n",
    "    # vectorizer = TfidfVectorizer()\n",
    "    # # Transform the text data into TF-IDF vectors\n",
    "    # X = vectorizer.fit_transform(texts)\n",
    "    # # Initialize PCA\n",
    "    # pca = PCA(n_components=20, random_state=42)\n",
    "    # # Convert the sparse matrix to a dense matrix since PCA doesn't support sparse input\n",
    "    # X_dense = X.toarray()\n",
    "    # # Apply PCA\n",
    "    # X_embedded_syn = pca.fit_transform(X_dense)\n",
    "    # data_pca_df = create_pca_df(X_embedded_syn, data_topic)\n",
    "\n",
    "    # # Load the model and make predictions on the training set\n",
    "    # model_re = bch_dict['model']\n",
    "    # train_predictions = model_re.predict(data_topic[[\"text\", \"area_TEIS\"]])\n",
    "\n",
    "    # # Flatten the list of lists\n",
    "    # flattened_predictions = [item[0] for item in train_predictions]\n",
    "\n",
    "    # # Add the predictions to the data_topic DataFrame\n",
    "    # data_topic[\"pred_topic_name\"] = flattened_predictions\n",
    "\n",
    "    # # Here we assume the training set is the same as X_train_r based on 'index_meta'\n",
    "    # extracted_train_set = data_pca_df[data_pca_df['index_meta'].isin(X_train_r['index_meta'])]\n",
    "    # # Reorder the rows according to the order of index_meta in X_train_r\n",
    "    # extracted_train_set = extracted_train_set.set_index('index_meta').loc[X_train_r['index_meta']].reset_index()\n",
    "\n",
    "    # # Append the new column \"pred_topic_name\" to the extracted df using the \"index_meta\" column\n",
    "    # extracted_train_set = extracted_train_set.merge(\n",
    "    #     data_topic[['index_meta', 'pred_topic_name']],\n",
    "    #     on='index_meta',\n",
    "    #     how='left'\n",
    "    # )\n",
    "        \n",
    "    # print(extracted_train_set)\n",
    "\n",
    "    # extracted_train_set.to_pickle(\"D:/AutoGeTS/Data/Train_PCA_YZ_withPred_1.pkl\")\n",
    "    # extracted_train_set.to_csv(\"D:/AutoGeTS/Data/Train_PCA_YZ_withPred_1.csv\", index=False)\n",
    "\n",
    "    # # Here we assume the test set is the same as X_test_re based on 'index_meta'\n",
    "    # extracted_test_set = data_pca_df[data_pca_df['index_meta'].isin(X_test_re['index_meta'])]\n",
    "    # # Reorder the rows according to the order of index_meta in X_test_re\n",
    "    # extracted_test_set = extracted_test_set.set_index('index_meta').loc[X_test_re['index_meta']].reset_index()\n",
    "\n",
    "    # # Append the new column \"pred_topic_name\" to the extracted df using the \"index_meta\" column\n",
    "    # extracted_test_set = extracted_test_set.merge(\n",
    "    #     data_topic[['index_meta', 'pred_topic_name']],\n",
    "    #     on='index_meta',\n",
    "    #     how='left'\n",
    "    # )\n",
    "\n",
    "    # extracted_test_set.to_pickle(\"D:/AutoGeTS/Data/Test_PCA_YZ_withPred_1.pkl\")\n",
    "    # extracted_test_set.to_csv(\"D:/AutoGeTS/Data/Test_PCA_YZ_withPred_1.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print what get_all_params() reports for the model stored under bch_dict[\"model\"];\n",
    "# requires bch_dict from the training cell above.\n",
    "print(bch_dict[\"model\"].get_all_params())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
