{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-25T22:45:09.847679Z",
     "iopub.status.busy": "2024-09-25T22:45:09.846705Z",
     "iopub.status.idle": "2024-09-25T22:45:11.550340Z",
     "shell.execute_reply": "2024-09-25T22:45:11.549338Z"
    }
   },
   "outputs": [],
   "source": [
    "from catboost import CatBoostClassifier, Pool\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
    "import re\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import time\n",
    "import pickle\n",
    "import itertools\n",
    "import os\n",
    "import psutil\n",
    "import gc\n",
    "import random\n",
    "import pynvml\n",
    "\n",
    "random.seed(42)\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-25T22:45:11.556309Z",
     "iopub.status.busy": "2024-09-25T22:45:11.556309Z",
     "iopub.status.idle": "2024-09-25T22:45:11.581038Z",
     "shell.execute_reply": "2024-09-25T22:45:11.580043Z"
    }
   },
   "outputs": [],
   "source": [
    "# Function to clean folder names\n",
    "def clean_folder_name(folder_name):\n",
    "    \"\"\"Return ``folder_name`` without characters that are invalid in folder names.\n",
    "\n",
    "    Angle brackets, colons, double quotes, forward and backward slashes, pipes,\n",
    "    question marks and asterisks are deleted; trailing dots and spaces are then\n",
    "    trimmed from the result.\n",
    "    \"\"\"\n",
    "    forbidden_chars = r'[<>:\"/\\\\|?*]'\n",
    "    return re.sub(forbidden_chars, '', folder_name).rstrip('. ')\n",
    "\n",
    "\n",
    "def CPU_monitor_memory_usage(high_threshold=95, low_threshold=30, poll_interval=10):\n",
    "    \"\"\"Print the system RAM usage and block while it is critically high.\n",
    "\n",
    "    Args:\n",
    "        high_threshold: usage percentage at or above which execution pauses.\n",
    "        low_threshold: usage percentage that must be reached before resuming.\n",
    "        poll_interval: seconds to sleep between two readings while paused.\n",
    "\n",
    "    The defaults reproduce the previously hard-coded 95 / 30 / 10 values.\n",
    "    \"\"\"\n",
    "    memory_usage = psutil.virtual_memory().percent\n",
    "    print(f\"CPU Current memory usage: {memory_usage}%\")\n",
    "\n",
    "    if memory_usage < high_threshold:\n",
    "        return\n",
    "\n",
    "    print(\"CPU Memory usage is too high. Pausing execution...\")\n",
    "    gc.collect()  # Trigger garbage collection manually before waiting\n",
    "    # Hysteresis: once paused, wait until usage drops to low_threshold or below\n",
    "    while memory_usage > low_threshold:\n",
    "        time.sleep(poll_interval)\n",
    "        memory_usage = psutil.virtual_memory().percent\n",
    "    print(\"CPU Memory usage is low enough. Resuming execution...\")\n",
    "\n",
    "def monitor_gpu_memory(device_index=0, high_threshold=95, low_threshold=30, poll_interval=10):\n",
    "    \"\"\"Print the memory usage of one GPU and block while it is critically high.\n",
    "\n",
    "    Args:\n",
    "        device_index: NVML index of the GPU to inspect.\n",
    "        high_threshold: usage percentage at or above which execution pauses.\n",
    "        low_threshold: usage percentage that must be reached before resuming.\n",
    "        poll_interval: seconds to sleep between two readings while paused.\n",
    "\n",
    "    The defaults reproduce the previously hard-coded 0 / 95 / 30 / 10 values.\n",
    "    \"\"\"\n",
    "    # Initialize NVML\n",
    "    pynvml.nvmlInit()\n",
    "\n",
    "    try:\n",
    "        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)\n",
    "        # Total memory is read once and reused for every percentage computation\n",
    "        total_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total\n",
    "\n",
    "        def read_usage():\n",
    "            # Percentage of GPU memory currently in use\n",
    "            return (pynvml.nvmlDeviceGetMemoryInfo(handle).used / total_memory) * 100\n",
    "\n",
    "        memory_usage = read_usage()\n",
    "        print(f\"Current GPU memory usage: {memory_usage:.2f}%\")\n",
    "\n",
    "        if memory_usage >= high_threshold:\n",
    "            print(\"GPU memory usage is too high. Pausing execution...\")\n",
    "            while memory_usage > low_threshold:\n",
    "                time.sleep(poll_interval)\n",
    "                memory_usage = read_usage()\n",
    "            print(\"GPU memory usage is low enough. Resuming execution...\")\n",
    "\n",
    "    finally:\n",
    "        # Always release NVML, even if a query above raised\n",
    "        pynvml.nvmlShutdown()\n",
    "\n",
    "def dict_to_foldername(param_dict, max_values=2):\n",
    "    \"\"\"Build a compact folder name such as ``lr=0.1,0.2_d=4`` from a parameter dict.\n",
    "\n",
    "    Each key is abbreviated to the first letter of its underscore-separated\n",
    "    words and paired with its comma-joined values. The pairs are joined with\n",
    "    underscores, and colons, forward slashes and backslashes become ``-``.\n",
    "\n",
    "    Args:\n",
    "        param_dict: mapping of parameter name to a value or an iterable of values.\n",
    "        max_values: accepted for backward compatibility but not applied;\n",
    "            truncating the value list could map different grids to one name.\n",
    "\n",
    "    Returns:\n",
    "        The folder name string (empty for an empty dict).\n",
    "    \"\"\"\n",
    "    parts = []\n",
    "    for key, values in param_dict.items():\n",
    "        # Empty segments (leading, trailing or doubled underscores) are skipped;\n",
    "        # indexing word[0] on them would raise IndexError.\n",
    "        short_key = ''.join(word[0] for word in str(key).split('_') if word)\n",
    "\n",
    "        # A bare string or a scalar is one value, not a sequence to iterate over\n",
    "        if isinstance(values, (str, bytes)) or not hasattr(values, '__iter__'):\n",
    "            values = [values]\n",
    "        value_str = ','.join(map(str, values))\n",
    "        parts.append(f\"{short_key}={value_str}\")\n",
    "\n",
    "    folder_name = \"_\".join(parts)\n",
    "\n",
    "    # Replace characters that are problematic in paths\n",
    "    return folder_name.replace(\":\", \"-\").replace(\"/\", \"-\").replace(\"\\\\\", \"-\")\n",
    "\n",
    "def make_hashable(dict_obj):\n",
    "    \"\"\"Return the dict's (key, value) pairs as a tuple sorted in ascending order.\"\"\"\n",
    "    ordered_items = list(dict_obj.items())\n",
    "    ordered_items.sort()\n",
    "    return tuple(ordered_items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-25T22:45:11.586042Z",
     "iopub.status.busy": "2024-09-25T22:45:11.586042Z",
     "iopub.status.idle": "2024-09-25T22:45:11.673671Z",
     "shell.execute_reply": "2024-09-25T22:45:11.673118Z"
    }
   },
   "outputs": [],
   "source": [
    "def CatList(cat_, list_, type_ = \"and\"):\n",
    "    \"\"\"Membership test whose polarity is selected by ``type_``.\n",
    "\n",
    "    ``\"and\"`` returns whether ``cat_`` is in ``list_``, ``\"or\"`` returns whether\n",
    "    it is absent, and any other ``type_`` returns True.\n",
    "    \"\"\"\n",
    "    if type_ == \"and\":\n",
    "        return cat_ in list_\n",
    "    if type_ == \"or\":\n",
    "        return cat_ not in list_\n",
    "    return True\n",
    "def cats_used(max_cat_others, data, output_cat, min_cats = 2):\n",
    "    \"\"\"Select the most frequent categories of ``data[output_cat]``.\n",
    "\n",
    "    Categories are taken in descending frequency until at least ``min_cats``\n",
    "    are selected and their cumulative share reaches ``100 - max_cat_others``\n",
    "    percent of the rows.\n",
    "\n",
    "    Args:\n",
    "        max_cat_others: maximum percentage of rows allowed outside the selected\n",
    "            categories; a value <= 0 selects every distinct category.\n",
    "        data: DataFrame containing the column ``output_cat``.\n",
    "        output_cat: name of the categorical column.\n",
    "        min_cats: minimum number of categories to select when available.\n",
    "\n",
    "    Returns:\n",
    "        List of categories: their ``str()`` form when ``max_cat_others > 0``,\n",
    "        otherwise the distinct column values unchanged.\n",
    "    \"\"\"\n",
    "    if max_cat_others <= 0:\n",
    "        # Series.unique() avoids the deprecated pd.unique() call on a plain list\n",
    "        return list(data[output_cat].unique())\n",
    "\n",
    "    cats_values = (data[output_cat].value_counts() / data.shape[0]) * 100\n",
    "    sum_perc = 0\n",
    "    cats_ = []\n",
    "    # Iterate (label, share) pairs: cats_values[i] with an integer i is a label\n",
    "    # lookup when the categories themselves are integers.\n",
    "    for category, share in cats_values.items():\n",
    "        if len(cats_) >= min_cats and sum_perc >= (100 - max_cat_others):\n",
    "            break\n",
    "        cats_.append(str(category))\n",
    "        sum_perc = sum_perc + share\n",
    "\n",
    "    return cats_\n",
    "\n",
    "\n",
    "def cats_levels(data, max_cat_others, output_cat, min_cats,prev_cats= {}):\n",
    "    \"\"\"Collect the categories returned by ``cats_used``, optionally per parent case.\n",
    "\n",
    "    With an empty ``prev_cats`` the selection runs once on the whole ``data``.\n",
    "    Otherwise ``prev_cats`` maps a column name to the cases of that column; the\n",
    "    selection runs on each matching subset and the results are concatenated in\n",
    "    iteration order (duplicates are kept).\n",
    "    \"\"\"\n",
    "    if len(prev_cats) == 0:\n",
    "        return cats_used(max_cat_others, data, output_cat, min_cats)\n",
    "\n",
    "    collected = []\n",
    "    for cat_name, cases in prev_cats.items():\n",
    "        print(cat_name)\n",
    "        for case in cases:\n",
    "            print(case)\n",
    "            subset = data[data[cat_name] == case]\n",
    "            collected.extend(cats_used(max_cat_others, subset, output_cat, min_cats))\n",
    "\n",
    "    return collected\n",
    "\n",
    "\n",
    "def Cats_Filter(instance, list_categories, name_prev_cat = None):\n",
    "    \"\"\"Keep ``instance`` if it is a listed category, else return an 'Other' label.\n",
    "\n",
    "    The fallback label is ``\"Other_TEIS\"``, or ``\"Other_TEIS_\"`` followed by\n",
    "    ``name_prev_cat`` when that argument is given.\n",
    "    \"\"\"\n",
    "    if instance in list_categories:\n",
    "        return instance\n",
    "    if name_prev_cat is None:\n",
    "        return \"Other_TEIS\"\n",
    "    return \"Other_TEIS_\"+name_prev_cat\n",
    "\n",
    "def ManageTextFeature(text_):\n",
    "    \"\"\"Return ``text_`` when its type is exactly ``str``, else ``\"No valid text\"``.\"\"\"\n",
    "    return text_ if type(text_) == str else \"No valid text\"\n",
    "\n",
    "\n",
    "def classification_report_to_df(report, y_true, y_pred):\n",
    "    \"\"\"Expand a classification report into a metrics table compared with the benchmark.\n",
    "\n",
    "    Reads the notebook globals ``bch_class_df`` (benchmark table subtracted to\n",
    "    build the ``Diff *`` columns), ``topic_dict`` (its values define the row\n",
    "    order) and ``iteration`` (when > 1 the benchmark's TP/FP/TN/FN columns are\n",
    "    dropped before the subtraction).\n",
    "\n",
    "    Args:\n",
    "        report: dict turned into a DataFrame; presumably the output of\n",
    "            sklearn's classification_report(output_dict=True), whose last three\n",
    "            entries are 'accuracy', 'macro avg' and 'weighted avg'.\n",
    "        y_true: true labels.\n",
    "        y_pred: predicted labels.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with per-class and summary rows holding the report metrics,\n",
    "        Sensitivity, Specificity, Balanced Accuracy, the ``Diff *`` columns,\n",
    "        Accuracy and the TP/FP/TN/FN counts.\n",
    "    \"\"\"\n",
    "    global bch_class_df\n",
    "    global topic_dict\n",
    "    global iteration\n",
    "    df = pd.DataFrame(report).transpose()\n",
    "\n",
    "    order_labels = list(topic_dict.values())\n",
    "\n",
    "    # Class rows only: exclude 'accuracy', 'macro avg', 'weighted avg'\n",
    "    labels = df.index[:-3]\n",
    "    cm = confusion_matrix(y_true, y_pred, labels=labels)\n",
    "\n",
    "    # One-vs-rest TP, FP, TN, FN for each class\n",
    "    TP = cm.diagonal()\n",
    "    FP = cm.sum(axis=0) - TP\n",
    "    FN = cm.sum(axis=1) - TP\n",
    "    TN = cm.sum() - (FP + FN + TP)\n",
    "\n",
    "    # Sensitivity and specificity pooled over all classes\n",
    "    sens = sum(TP) / (sum(TP)+sum(FN))\n",
    "    spec = sum(TN) / (sum(TN)+sum(FP))\n",
    "\n",
    "    # Sensitivity is the same as recall\n",
    "    df['Sensitivity'] = df['recall']\n",
    "\n",
    "    # Per-class specificity for the class rows only\n",
    "    df.loc[df.index[:-3], 'Specificity'] = TN / (TN + FP)\n",
    "\n",
    "    # The 'accuracy' row receives the pooled sensitivity and specificity\n",
    "    df.loc['accuracy', ['Sensitivity', 'Specificity']] = sens, spec\n",
    "\n",
    "    # Calculate 'macro avg' and 'weighted avg' for sensitivity and specificity\n",
    "    df.loc['macro avg', 'Sensitivity'] = df.iloc[:-3]['Sensitivity'].mean()\n",
    "    df.loc['weighted avg', 'Sensitivity'] = np.average(df.iloc[:-3]['Sensitivity'], weights=df.iloc[:-3]['support'])\n",
    "\n",
    "    df.loc['macro avg', 'Specificity'] = df.iloc[:-3]['Specificity'].mean()\n",
    "    df.loc['weighted avg', 'Specificity'] = np.average(df.iloc[:-3]['Specificity'], weights=df.iloc[:-3]['support'])\n",
    "\n",
    "    # Calculate Balanced Accuracy for each row, including special averages\n",
    "    df['Balanced Accuracy'] = (df['Sensitivity'] + df['Specificity']) / 2\n",
    "\n",
    "    # Overwrite the 'accuracy' row with pooled precision, recall and F1\n",
    "    df.loc['accuracy', 'precision'] = sum(TP) / (sum(TP) + sum(FP))\n",
    "    df.loc['accuracy', 'recall'] = sum(TP) / (sum(TP) + sum(FN))\n",
    "    df.loc['accuracy', 'f1-score'] = 2* sum(TP) / (2 * sum(TP) + sum(FP) + sum(FN))\n",
    "\n",
    "    if iteration > 1:\n",
    "        bch_class_df_noFr = bch_class_df.drop(columns=['TP', 'FP', 'TN', 'FN'])\n",
    "    else: \n",
    "        bch_class_df_noFr = bch_class_df\n",
    "\n",
    "    diff_df = df - bch_class_df_noFr\n",
    "    # Renaming columns for clarity\n",
    "    diff_df.columns = ['Diff ' + col for col in diff_df.columns]\n",
    "\n",
    "    # Concatenating the original dataframe with the differences\n",
    "    combined_df = pd.concat([df, diff_df], axis=1)\n",
    "\n",
    "    class_accuracy = cm.diagonal() / cm.sum(axis=1)\n",
    "    combined_df.loc[labels, 'Accuracy'] = class_accuracy\n",
    "    # Copying f1-score to 'Accuracy' for the last three rows\n",
    "    combined_df.loc[['accuracy', 'macro avg', 'weighted avg'], 'Accuracy'] = combined_df.loc[['accuracy', 'macro avg', 'weighted avg'], 'f1-score']\n",
    "\n",
    "    # Calculate and append TP, FP, TN, FN metrics\n",
    "    metrics_df = pd.DataFrame({\n",
    "        \"TP\": TP,\n",
    "        \"FP\": FP,\n",
    "        \"TN\": TN,\n",
    "        \"FN\": FN\n",
    "    }, index=labels)\n",
    "\n",
    "    # Merge the new metrics into the existing DataFrame\n",
    "    combined_df = combined_df.merge(metrics_df, left_index=True, right_index=True, how='left')\n",
    "    if 'accuracy' in order_labels:\n",
    "        # Reorder DataFrame based on specified order labels\n",
    "        combined_df = combined_df.reindex(order_labels + ['macro avg', 'weighted avg'])\n",
    "    else:\n",
    "        combined_df = combined_df.reindex(order_labels + ['accuracy', 'macro avg', 'weighted avg'])\n",
    "\n",
    "    return combined_df\n",
    "\n",
    "\n",
    "def dominates(score1, score2):\n",
    "    \"\"\"Return True when ``score1`` Pareto-dominates ``score2`` on two maximised objectives.\n",
    "\n",
    "    ``score1`` must be at least as good on both objectives and strictly better\n",
    "    on at least one of them.\n",
    "    \"\"\"\n",
    "    no_worse = score1[0] >= score2[0] and score1[1] >= score2[1]\n",
    "    return no_worse and (score1[0] > score2[0] or score1[1] > score2[1])\n",
    "\n",
    "def observer_function(all_segments_dict, pca_pair):\n",
    "    \"\"\"Log every evaluated segment of one PCA pair and report its Pareto front.\n",
    "\n",
    "    For each segment the classification table is written under ``Class_DF`` and\n",
    "    a summary row is appended to the global ``fold_results_df``, which is\n",
    "    re-saved after every row. The non-dominated segments with respect to\n",
    "    ``fitness_score`` are then determined, their keys are appended to the\n",
    "    global ``history_pareto_segments_list``, the population / Pareto-front\n",
    "    scatter plot is saved, and a statistics row is appended to the global\n",
    "    ``PCA_stats_df`` (saved as CSV and pickle).\n",
    "\n",
    "    Args:\n",
    "        all_segments_dict: mapping of segment key to a dict providing\n",
    "            'classification_df', 'PCA', 'coordinates', 'fitness_score'\n",
    "            (pair: overall accuracy, topic recall), 'retraining_time',\n",
    "            'number_of_syn_sample', 'retrained_dots_list', 'true_labels'\n",
    "            and 'predicted_labels'.\n",
    "        pca_pair: two-element pair used in titles and output file names.\n",
    "\n",
    "    Returns:\n",
    "        None. Returns early, after the per-segment logging, when no objective\n",
    "        value is available or all of them are 0.\n",
    "    \"\"\"\n",
    "    global NSW_results_path\n",
    "    global PCA_stats_df_name\n",
    "    global PCA_stats_df\n",
    "    global topic_name\n",
    "    global topic_number\n",
    "    global history_pareto_segments_list\n",
    "    global fold_results_df\n",
    "    global NSW_results_name\n",
    "    global used_pca_pairs\n",
    "    global bch_m0\n",
    "\n",
    "    def has_scores(score):\n",
    "        # Only segments with both objectives evaluated can be compared\n",
    "        return score[0] is not None and score[1] is not None\n",
    "\n",
    "    class_DF_path = f'{NSW_results_path}/Class_DF'\n",
    "    os.makedirs(class_DF_path, exist_ok=True)\n",
    "\n",
    "    for key, seg in all_segments_dict.items():\n",
    "        classification_df = seg['classification_df']\n",
    "\n",
    "        new_row_index = len(fold_results_df)\n",
    "        class_df_stem = f'{class_DF_path}/{topic_number}_NSW_{new_row_index}_AllEval_ClassDF'\n",
    "        classification_df.to_csv(f'{class_df_stem}.csv', index=True)\n",
    "        classification_df.to_pickle(f'{class_df_stem}.pkl')\n",
    "\n",
    "        # Collect all generation data into a new DataFrame row\n",
    "        new_ParamCV_row = {\n",
    "            \"topic_name\": topic_name,\n",
    "            \"topic_number\": topic_number,\n",
    "            \"PCA_index\": used_pca_pairs.index(seg['PCA']),\n",
    "            \"PCA\": seg['PCA'],\n",
    "            \"coordinates\": seg['coordinates'],\n",
    "            'fitness_score': seg['fitness_score'],\n",
    "            \"accuracy\": seg['fitness_score'][0],\n",
    "            \"topic_recall\": seg['fitness_score'][1],\n",
    "            'balanced_fitness_score': (classification_df.loc['accuracy', 'Balanced Accuracy'], classification_df.loc[topic_name, 'Balanced Accuracy']),\n",
    "            'overall_balanced_accuracy': classification_df.loc['accuracy', 'Balanced Accuracy'],\n",
    "            'topic_balanced_accuracy': classification_df.loc[topic_name, 'Balanced Accuracy'],\n",
    "            'balanced_acc_rec_score': (classification_df.loc[topic_name, 'Balanced Accuracy'], classification_df.loc[topic_name, 'recall']),\n",
    "            'topic_F1': classification_df.loc[topic_name, 'f1-score'],\n",
    "            'overall_F1': classification_df.loc['accuracy', 'f1-score'],\n",
    "            'overall_recall': classification_df.loc['accuracy', 'recall'],\n",
    "            \"retraining_time\": seg[\"retraining_time\"],\n",
    "            \"number_of_syn_sample\": seg['number_of_syn_sample'],\n",
    "            \"retrained_dots_list\": seg['retrained_dots_list'],\n",
    "            'true_labels': seg['true_labels'],\n",
    "            'predicted_labels': seg['predicted_labels'],\n",
    "            \"segment_key\": key,\n",
    "            'classDF_path': f'{class_df_stem}.csv',\n",
    "            # NOTE(review): this topic label is hard-coded regardless of topic_name\n",
    "            'T13_TBA_Imp': classification_df.loc['Humanitarian aid for Ukraine.', 'Balanced Accuracy'] - bch_m0.loc['Humanitarian aid for Ukraine.', 'Balanced Accuracy']\n",
    "        }\n",
    "\n",
    "        # Append the row and re-save after every segment so partial results are on disk\n",
    "        new_ParamCV_row_df = pd.DataFrame([new_ParamCV_row])\n",
    "        fold_results_df = pd.concat([fold_results_df, new_ParamCV_row_df], ignore_index=True)\n",
    "        fold_results_df.to_csv(f'{NSW_results_path}/NSWAllSegs_{NSW_results_name}.csv', index=False)\n",
    "        fold_results_df.to_pickle(f'{NSW_results_path}/NSWAllSegs_{NSW_results_name}.pkl')\n",
    "\n",
    "    objective_1_values = [seg['fitness_score'][0] for seg in all_segments_dict.values() if seg['fitness_score'][0] is not None]\n",
    "    objective_2_values = [seg['fitness_score'][1] for seg in all_segments_dict.values() if seg['fitness_score'][1] is not None]\n",
    "\n",
    "    # Check if both lists are empty or filled with zeros\n",
    "    if not objective_1_values or not objective_2_values or all(value == 0 for value in objective_1_values + objective_2_values):\n",
    "        print(\"All objectives are None or 0, skipping further processes.\")\n",
    "        return  # Exit the function\n",
    "\n",
    "    non_dominated_segments = {}\n",
    "    for key1, dict1 in all_segments_dict.items():\n",
    "        if not has_scores(dict1['fitness_score']):\n",
    "            continue  # Skip non-evaluable segments\n",
    "\n",
    "        dominated = False\n",
    "        for key2, dict2 in all_segments_dict.items():\n",
    "            # A non-evaluated segment cannot dominate; comparing None would raise TypeError\n",
    "            if key1 == key2 or not has_scores(dict2['fitness_score']):\n",
    "                continue\n",
    "            if dominates(dict2['fitness_score'], dict1['fitness_score']):\n",
    "                dominated = True\n",
    "                break\n",
    "        if not dominated:\n",
    "            non_dominated_segments[key1] = dict1\n",
    "\n",
    "    pareto_fitness_tuples = [(ft[0], ft[1]) for ft in [seg['fitness_score'] for seg in non_dominated_segments.values()] if ft[0] is not None and ft[1] is not None]\n",
    "    pareto_segments_tuples = [(seg['coordinates'], seg['number_of_syn_sample'], seg['retrained_dots_list']) for seg in non_dominated_segments.values()]\n",
    "\n",
    "    # Calculate statistics\n",
    "    worst_fitness = (min(objective_1_values), min(objective_2_values))\n",
    "    best_fitness = (max(objective_1_values), max(objective_2_values))\n",
    "    mean_OverallAcc = np.mean(objective_1_values)\n",
    "    mean_ClassRecall = np.mean(objective_2_values)\n",
    "\n",
    "    print(f\"PCA: {pca_pair}, Evaluations: {len(all_segments_dict)}\")\n",
    "    print(f\"Best Fitness: {best_fitness}\")\n",
    "    print(f\"Worst Fitness: {worst_fitness}\")\n",
    "    print(f\"Mean Overall Accuracy: {mean_OverallAcc}\")\n",
    "    print(f\"Mean {topic_name} Recall: {mean_ClassRecall}\")\n",
    "\n",
    "    print(\"Pareto Front Selections:---------------------\")\n",
    "    for i, (key, seg) in enumerate(non_dominated_segments.items()):\n",
    "        print(f\"Segment {i+1}: {seg['coordinates']}, Num Samples {seg['number_of_syn_sample']}: {seg['retrained_dots_list']} \\nFitness: {seg['fitness_score']}\")\n",
    "        history_pareto_segments_list.append(key)\n",
    "        print('---')\n",
    "    print('------------------------')\n",
    "\n",
    "    # Plotting\n",
    "    plt.figure(figsize=(12, 7))\n",
    "    plt.scatter(objective_1_values, objective_2_values, c='blue', alpha=0.5, label='Population')\n",
    "    if pareto_fitness_tuples:\n",
    "        plt.scatter([ft[0] for ft in pareto_fitness_tuples], [ft[1] for ft in pareto_fitness_tuples], c='red', alpha=0.9, label='Pareto Front')\n",
    "\n",
    "    plt.xlabel('Objective 1: Overall Accuracy')\n",
    "    plt.ylabel(f'Objective 2: {topic_name} Recall')\n",
    "    plt.title(f'{topic_number}, Population and Pareto Front at PCA {pca_pair}')\n",
    "    plt.legend()\n",
    "    pareto_plots_dir = f\"{NSW_results_path}/{PCA_stats_df_name}\"\n",
    "    os.makedirs(pareto_plots_dir, exist_ok=True)\n",
    "    filename = f\"{pca_pair[0]}{pca_pair[1]}.png\"\n",
    "    plt.savefig(os.path.join(pareto_plots_dir, filename), dpi=200, bbox_inches='tight', pad_inches=0)\n",
    "    plt.show()\n",
    "    # Release the figure; this function runs once per PCA pair and open figures accumulate\n",
    "    plt.close()\n",
    "\n",
    "    recall_key = f\"Mean {topic_name} Recall\"\n",
    "    # Collect all generation data into a new DataFrame row\n",
    "    new_row = {\n",
    "        \"PCA\": f\"{pca_pair[0]}{pca_pair[1]}\",\n",
    "        \"Number of Evaluations\": len(all_segments_dict),\n",
    "        \"Best Fitness\": best_fitness,\n",
    "        \"Worst Fitness\": worst_fitness,\n",
    "        \"Mean Overall Accuracy\": mean_OverallAcc,\n",
    "        recall_key: mean_ClassRecall,\n",
    "        \"Pareto Front Segments\": [pareto_segments_tuples, pareto_fitness_tuples]\n",
    "    }\n",
    "    # Convert the dictionary to a DataFrame for a single row\n",
    "    new_row_df = pd.DataFrame([new_row])\n",
    "    # Concatenate this new row DataFrame to the existing DataFrame\n",
    "    PCA_stats_df = pd.concat([PCA_stats_df, new_row_df], ignore_index=True)\n",
    "    PCA_stats_df.to_csv(f'{NSW_results_path}/{PCA_stats_df_name}/{pca_pair[0]}{pca_pair[1]}.csv', index=False)\n",
    "    PCA_stats_df.to_pickle(f'{NSW_results_path}/{PCA_stats_df_name}/{pca_pair[0]}{pca_pair[1]}.pkl')\n",
    "\n",
    "\n",
    "def final_pareto_observer(history_pareto_segments_list):\n",
    "    \"\"\"Report the Pareto front over every Pareto segment collected so far.\n",
    "\n",
    "    Segment data is looked up in the global ``history_segments_dict_allPCA``.\n",
    "    Each non-dominated segment is appended as a row to the global\n",
    "    ``fold_pfs_df`` (re-saved after every row), the population / Pareto-front\n",
    "    scatter plot is saved, and a statistics row labelled 'All PCAs' is appended\n",
    "    to the global ``PCA_stats_df`` (saved as CSV and pickle).\n",
    "\n",
    "    Args:\n",
    "        history_pareto_segments_list: list of segment keys present in\n",
    "            ``history_segments_dict_allPCA``; elements 2 and 3 of each key are\n",
    "            copied into the 'Pareto Front Segments' statistics entry.\n",
    "\n",
    "    Returns:\n",
    "        None. Returns early when no objective value is available or all are 0.\n",
    "    \"\"\"\n",
    "    global history_segments_dict_allPCA\n",
    "    global topic_name\n",
    "    global topic_number\n",
    "    global NSW_results_path\n",
    "    global PCA_stats_df_name\n",
    "    global level_stats_df\n",
    "    global fold_pfs_df\n",
    "    global NSW_results_name\n",
    "    global used_pca_pairs\n",
    "    global PCA_stats_df\n",
    "\n",
    "    def has_scores(score):\n",
    "        # Only segments with both objectives evaluated can be compared\n",
    "        return score[0] is not None and score[1] is not None\n",
    "\n",
    "    objective_1_values = [history_segments_dict_allPCA[seg_key]['fitness_score'][0] for seg_key in history_pareto_segments_list if history_segments_dict_allPCA[seg_key]['fitness_score'][0] is not None]\n",
    "    objective_2_values = [history_segments_dict_allPCA[seg_key]['fitness_score'][1] for seg_key in history_pareto_segments_list if history_segments_dict_allPCA[seg_key]['fitness_score'][1] is not None]\n",
    "\n",
    "    # Check if both lists are empty or filled with zeros\n",
    "    if not objective_1_values or not objective_2_values or all(value == 0 for value in objective_1_values + objective_2_values):\n",
    "        print(\"All objectives are None or 0, skipping further processes.\")\n",
    "        return  # Exit the function\n",
    "\n",
    "    non_dominated_segments = {}\n",
    "    for i1, key1 in enumerate(history_pareto_segments_list):\n",
    "        score1 = history_segments_dict_allPCA[key1]['fitness_score']\n",
    "        if not has_scores(score1):\n",
    "            continue  # Skip non-evaluable segments\n",
    "\n",
    "        dominated = False\n",
    "        for i2, key2 in enumerate(history_pareto_segments_list):\n",
    "            score2 = history_segments_dict_allPCA[key2]['fitness_score']\n",
    "            # A non-evaluated segment cannot dominate; comparing None would raise TypeError\n",
    "            if i1 == i2 or not has_scores(score2):\n",
    "                continue\n",
    "            if dominates(score2, score1):\n",
    "                dominated = True\n",
    "                break\n",
    "        if not dominated:\n",
    "            non_dominated_segments[key1] = history_segments_dict_allPCA[key1]\n",
    "\n",
    "    pareto_fitness_tuples = [(ft[0], ft[1]) for ft in [history_segments_dict_allPCA[seg_key]['fitness_score'] for seg_key in non_dominated_segments.keys()] if ft[0] is not None and ft[1] is not None]\n",
    "    pareto_segments_tuples = [(seg_key[2], seg_key[3], history_segments_dict_allPCA[seg_key]['number_of_syn_sample'], history_segments_dict_allPCA[seg_key]['retrained_dots_list']) for seg_key in non_dominated_segments.keys()]\n",
    "\n",
    "    # Calculate statistics\n",
    "    worst_fitness = (min(objective_1_values), min(objective_2_values))\n",
    "    best_fitness = (max(objective_1_values), max(objective_2_values))\n",
    "    mean_OverallAcc = np.mean(objective_1_values)\n",
    "    mean_ClassRecall = np.mean(objective_2_values)\n",
    "\n",
    "    print(f\"All Pareto Segments in History, Evaluations: {len(history_pareto_segments_list)}\")\n",
    "    print(f\"Best Fitness: {best_fitness}\")\n",
    "    print(f\"Worst Fitness: {worst_fitness}\")\n",
    "    print(f\"Mean Overall Accuracy: {mean_OverallAcc}\")\n",
    "    print(f\"Mean {topic_name} Recall: {mean_ClassRecall}\")\n",
    "\n",
    "    print(\"Pareto Front Selections:---------------------\")\n",
    "    for i, (key, seg) in enumerate(non_dominated_segments.items()):\n",
    "        print(f\"Segment {i+1}: {seg['coordinates']}, {seg['PCA']}, Num Samples {seg['number_of_syn_sample']}: {seg['retrained_dots_list']} \\nFitness: {seg['fitness_score']}\")\n",
    "\n",
    "        # Collect all generation data into a new DataFrame row\n",
    "        new_PFs_row = {\n",
    "            \"topic_name\": topic_name,\n",
    "            \"topic_number\": topic_number,\n",
    "            \"PCA_index\": used_pca_pairs.index(seg['PCA']),\n",
    "            \"PCA\": seg['PCA'],\n",
    "            \"coordinates\": seg['coordinates'],\n",
    "            'fitness_score': seg['fitness_score'],\n",
    "            \"accuracy\": seg['fitness_score'][0],\n",
    "            \"topic_recall\": seg['fitness_score'][1],\n",
    "            \"retraining_time\": seg[\"retraining_time\"],\n",
    "            \"number_of_syn_sample\": seg['number_of_syn_sample'],\n",
    "            \"retrained_dots_list\": seg['retrained_dots_list'],\n",
    "            'true_labels': seg['true_labels'],\n",
    "            'predicted_labels': seg['predicted_labels'],\n",
    "            \"segment_key\": key\n",
    "        }\n",
    "\n",
    "        # Append the row and re-save after every segment so partial results are on disk\n",
    "        new_PFs_row_df = pd.DataFrame([new_PFs_row])\n",
    "        fold_pfs_df = pd.concat([fold_pfs_df, new_PFs_row_df], ignore_index=True)\n",
    "        fold_pfs_df.to_csv(f'{NSW_results_path}/NSWPFs_{NSW_results_name}.csv', index=False)\n",
    "        fold_pfs_df.to_pickle(f'{NSW_results_path}/NSWPFs_{NSW_results_name}.pkl')\n",
    "        print('---')\n",
    "    print('------------------------')\n",
    "\n",
    "    # Plotting\n",
    "    plt.figure(figsize=(12, 7))\n",
    "    plt.scatter(objective_1_values, objective_2_values, c='blue', alpha=0.5, label='Population')\n",
    "    if pareto_fitness_tuples:\n",
    "        plt.scatter([ft[0] for ft in pareto_fitness_tuples], [ft[1] for ft in pareto_fitness_tuples], c='red', alpha=0.9, label='Pareto Front')\n",
    "\n",
    "    plt.xlabel('Objective 1: Overall Accuracy')\n",
    "    plt.ylabel(f'Objective 2: {topic_name} Recall')\n",
    "    plt.title(f'{topic_number}, Population and Pareto Front for All Pareto Segments in History')\n",
    "    plt.legend()\n",
    "    pareto_plots_dir = f\"{NSW_results_path}/{PCA_stats_df_name}\"\n",
    "    os.makedirs(pareto_plots_dir, exist_ok=True)\n",
    "    filename = \"All Pareto Segments in History.png\"\n",
    "    plt.savefig(os.path.join(pareto_plots_dir, filename), dpi=200, bbox_inches='tight', pad_inches=0)\n",
    "    plt.show()\n",
    "    # Release the figure so repeated calls do not accumulate open figures\n",
    "    plt.close()\n",
    "\n",
    "    recall_key = f\"Mean {topic_name} Recall\"\n",
    "    # Collect all generation data into a new DataFrame row\n",
    "    new_row = {\n",
    "        \"PCA\": \"All PCAs\",\n",
    "        \"Number of Evaluations\": len(history_pareto_segments_list),\n",
    "        \"Best Fitness\": best_fitness,\n",
    "        \"Worst Fitness\": worst_fitness,\n",
    "        \"Mean Overall Accuracy\": mean_OverallAcc,\n",
    "        recall_key: mean_ClassRecall,\n",
    "        \"Pareto Front Segments\": [pareto_segments_tuples, pareto_fitness_tuples]\n",
    "    }\n",
    "    # Convert the dictionary to a DataFrame for a single row\n",
    "    new_row_df = pd.DataFrame([new_row])\n",
    "    # Concatenate this new row DataFrame to the existing DataFrame\n",
    "    PCA_stats_df = pd.concat([PCA_stats_df, new_row_df], ignore_index=True)\n",
    "    PCA_stats_df.to_csv(f'{NSW_results_path}/{PCA_stats_df_name}/All Pareto Segments in History.csv', index=False)\n",
    "    PCA_stats_df.to_pickle(f'{NSW_results_path}/{PCA_stats_df_name}/All Pareto Segments in History.pkl')\n",
    "\n",
    "\n",
    "def pca_pareto_observer(history_pareto_segments_list, pca_pair):\n",
    "    global history_segments_dict_allPCA\n",
    "    global topic_name\n",
    "    global topic_number\n",
    "    global NSW_results_path\n",
    "    global NSW_results_name\n",
    "    global PCA_stats_df_name\n",
    "    global pca_pfs_df\n",
    "    global used_pca_pairs\n",
    "\n",
    "    # Filter the list to include only tuples where the (pca1, pca2) matches the input pca_pair\n",
    "    filtered_list = [item for item in history_pareto_segments_list if item[2] == pca_pair]\n",
    "\n",
    "    objective_1_values = [history_segments_dict_allPCA[seg_key]['fitness_score'][0] for seg_key in filtered_list if history_segments_dict_allPCA[seg_key]['fitness_score'][0] is not None]\n",
    "    objective_2_values = [history_segments_dict_allPCA[seg_key]['fitness_score'][1] for seg_key in filtered_list if history_segments_dict_allPCA[seg_key]['fitness_score'][1] is not None]\n",
    "\n",
    "    # Check if both lists are empty or filled with zeros\n",
    "    if not objective_1_values or not objective_2_values or all(value == 0 for value in objective_1_values + objective_2_values):\n",
    "        print(\"All objectives are None or 0, skipping further processes.\")\n",
    "        return  # Exit the function\n",
    "\n",
    "    non_dominated_segments = {}\n",
    "    for i1, key1 in enumerate(filtered_list):\n",
    "        if history_segments_dict_allPCA[key1]['fitness_score'] == (None, None):\n",
    "            continue  # Skip non-evaluable segments\n",
    "\n",
    "        dominated = False\n",
    "        for i2, key2 in enumerate(filtered_list):\n",
    "            if i1 != i2 and dominates(history_segments_dict_allPCA[key2]['fitness_score'], history_segments_dict_allPCA[key1]['fitness_score']):\n",
    "                dominated = True\n",
    "                break\n",
    "        if not dominated:\n",
    "            non_dominated_segments[key1] = history_segments_dict_allPCA[key1]\n",
    "\n",
    "    pareto_fitness_tuples = [(ft[0], ft[1]) for ft in [history_segments_dict_allPCA[seg_key]['fitness_score'] for seg_key in non_dominated_segments.keys()] if ft[0] is not None and ft[1] is not None]\n",
    "    # pareto_segments_tuples = [(seg_key[2], seg_key[3], history_segments_dict_allPCA[seg_key]['number_of_syn_sample'], history_segments_dict_allPCA[seg_key]['retrained_dots_list']) for seg_key in non_dominated_segments.keys()]\n",
    "\n",
    "    # Calculate statistics\n",
    "    worst_fitness = (min(objective_1_values), min(objective_2_values))\n",
    "    best_fitness = (max(objective_1_values), max(objective_2_values))\n",
    "    mean_OverallAcc = np.mean(objective_1_values)\n",
    "    mean_ClassRecall = np.mean(objective_2_values)\n",
    "\n",
    "    print(f\"All Pareto Segments in {pca_pair}, Evaluations: {len(filtered_list)}\")\n",
    "    print(f\"Best Fitness: {best_fitness}\")\n",
    "    print(f\"Worst Fitness: {worst_fitness}\")\n",
    "    print(f\"Mean Overall Accuracy: {mean_OverallAcc}\")\n",
    "    print(f\"Mean {topic_name} Recall: {mean_ClassRecall}\")\n",
    "\n",
    "    print(\"Pareto Front Selections:---------------------\")\n",
    "    for i, (key, seg) in enumerate(non_dominated_segments.items()):\n",
    "        print(f\"Segment {i+1}: {seg['coordinates']}, {seg['PCA']}, Num Samples {seg['number_of_syn_sample']}: {seg['retrained_dots_list']} \\nFitness: {seg['fitness_score']}\")\n",
    "        \n",
    "        classification_df = seg['classification_df']\n",
    "        # Collect all generation data into a new DataFrame row\n",
    "        pca_PFs_row = {\n",
    "            \"topic_name\": topic_name,\n",
    "            \"topic_number\": topic_number,\n",
    "            \"PCA_index\": used_pca_pairs.index(seg['PCA']),\n",
    "            \"PCA\": seg['PCA'],\n",
    "            \"coordinates\": seg['coordinates'],\n",
    "            'fitness_score': seg['fitness_score'],\n",
    "            \"accuracy\": seg['fitness_score'][0],\n",
    "            \"topic_recall\": seg['fitness_score'][1],\n",
    "            'balanced_fitness_score': (classification_df.loc['accuracy', 'Balanced Accuracy'], classification_df.loc[topic_name, 'Balanced Accuracy']),\n",
    "            'overall_balanced_accuracy': classification_df.loc['accuracy', 'Balanced Accuracy'],\n",
    "            'topic_balanced_accuracy': classification_df.loc[topic_name, 'Balanced Accuracy'],\n",
    "            'balanced_acc_rec_score': (classification_df.loc[topic_name, 'Balanced Accuracy'], classification_df.loc[topic_name, 'recall']),\n",
    "            'topic_F1': classification_df.loc[topic_name, 'f1-score'],\n",
    "            'overall_F1': classification_df.loc['accuracy', 'f1-score'],\n",
    "            'overall_recall': classification_df.loc['accuracy', 'recall'],\n",
    "            \"retraining_time\": seg[\"retraining_time\"],\n",
    "            \"number_of_syn_sample\": seg['number_of_syn_sample'],\n",
    "            \"retrained_dots_list\": seg['retrained_dots_list'],\n",
    "            'true_labels': seg['true_labels'],\n",
    "            'predicted_labels': seg['predicted_labels'],\n",
    "            \"segment_key\": key\n",
    "        }\n",
    "        # # Transform DataFrame to dict format\n",
    "        # for idx in classification_df.index:\n",
    "        #     if idx != 'accuracy' and idx != 'macro avg' and idx != 'weighted avg':\n",
    "        #         pca_PFs_row[idx] = classification_df.loc[idx].dropna().to_dict()\n",
    "        # Convert the dictionary to a DataFrame for a single row\n",
    "        pca_PFs_row_df = pd.DataFrame([pca_PFs_row])\n",
    "        # Concatenate this new row DataFrame to the existing DataFrame\n",
    "        pca_pfs_df = pd.concat([pca_pfs_df, pca_PFs_row_df], ignore_index=True)\n",
    "        pca_pfs_df.to_csv(f'{NSW_results_path}/NSWpcaPFs_{NSW_results_name}.csv', index=False)\n",
    "        pca_pfs_df.to_pickle(f'{NSW_results_path}/NSWpcaPFs_{NSW_results_name}.pkl')\n",
    "        print('---')\n",
    "    print('------------------------')\n",
    "\n",
    "    # Plotting\n",
    "    plt.figure(figsize=(12, 7))\n",
    "    plt.scatter(objective_1_values, objective_2_values, c='blue', alpha=0.5, label='Population')\n",
    "    if pareto_fitness_tuples:\n",
    "        plt.scatter([ft[0] for ft in pareto_fitness_tuples], [ft[1] for ft in pareto_fitness_tuples], c='red', alpha=0.9, label='Pareto Front')\n",
    "    \n",
    "    plt.xlabel('Objective 1: Overall Accuracy')\n",
    "    plt.ylabel(f'Objective 2: {topic_name} Recall')\n",
    "    plt.title(f'{topic_number}, Population and Pareto Front for All Pareto Segments in {pca_pair}')\n",
    "    plt.legend()\n",
    "    pareto_plots_dir = f\"{NSW_results_path}/{PCA_stats_df_name}\"\n",
    "    os.makedirs(pareto_plots_dir, exist_ok=True)\n",
    "    filename = f\"All Pareto Segments in {pca_pair}.png\"\n",
    "    plt.savefig(os.path.join(pareto_plots_dir, filename), dpi=200, bbox_inches='tight', pad_inches=0)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-25T22:45:11.680656Z",
     "iopub.status.busy": "2024-09-25T22:45:11.679654Z",
     "iopub.status.idle": "2024-09-25T22:45:11.721734Z",
     "shell.execute_reply": "2024-09-25T22:45:11.720732Z"
    }
   },
   "outputs": [],
   "source": [
    "def segment_retraining(data_syn, individual_Segment_dict, segment_key):\n",
    "    \"\"\"Retrain the classifier with one segment's synthetic samples and log the results.\n",
    "\n",
    "    Rows of `data_syn` whose `index_meta` is among the segment's selected dots\n",
    "    are appended to the global training split (`X_train_r` / `Y_train_r`), a\n",
    "    CatBoost model is fitted, and its metrics on the validation split\n",
    "    (`X_test_re` / `Y_test_re`) are stored in `individual_Segment_dict`. The\n",
    "    model is then scored on the held-out test split (`X_test_re_Test` /\n",
    "    `Y_test_re_Test`); that report is saved to disk and one row is appended to\n",
    "    the global `test_fold_results_df`, which is re-written to CSV and pickle.\n",
    "\n",
    "    Two shortcuts return before any training happens:\n",
    "      * no dots selected -> every result field is set to None / empty;\n",
    "      * an earlier segment in `history_segments_dict` was retrained with the\n",
    "        identical dot list -> its results are reused.\n",
    "\n",
    "    Args:\n",
    "        data_syn: DataFrame of synthetic samples, filtered on `index_meta`;\n",
    "            its `topic_name` column supplies the labels of the added rows.\n",
    "        individual_Segment_dict: Segment record providing `red_dots_list`,\n",
    "            `blue_dots_list`, `PCA` and `coordinates`. Mutated in place.\n",
    "        segment_key: Not used by this function; kept so existing callers\n",
    "            keep working.\n",
    "\n",
    "    Returns:\n",
    "        The same `individual_Segment_dict`, extended with `model`,\n",
    "        `true_labels`, `predicted_labels`, `classification_df`,\n",
    "        `fitness_score`, `number_of_syn_sample`, `retraining_time` and\n",
    "        `retrained_dots_list`.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the global `dots_mode` is neither \"False\" nor \"Both\".\n",
    "    \"\"\"\n",
    "    # Only the globals rebound below need a declaration; all other module-level\n",
    "    # names (dots_mode, X_train_r, catboost_params, ...) are merely read.\n",
    "    global sum_GPU_seconds\n",
    "    global GPU_limit\n",
    "    global test_fold_results_df\n",
    "\n",
    "    CPU_monitor_memory_usage()\n",
    "    monitor_gpu_memory()\n",
    "\n",
    "    if dots_mode == \"False\":\n",
    "        syn_original_list = individual_Segment_dict[\"red_dots_list\"]\n",
    "    elif dots_mode == \"Both\":\n",
    "        syn_original_list = individual_Segment_dict[\"red_dots_list\"] + individual_Segment_dict[\"blue_dots_list\"]\n",
    "    else:\n",
    "        # An unknown mode used to leave syn_original_list unbound and fail a few\n",
    "        # lines later with an opaque UnboundLocalError.\n",
    "        raise ValueError(f\"Unsupported dots_mode {dots_mode!r}; expected 'False' or 'Both'.\")\n",
    "\n",
    "    # Nothing selected: mark the segment as not evaluated.\n",
    "    if len(syn_original_list) == 0:\n",
    "        individual_Segment_dict['model'] = None\n",
    "        individual_Segment_dict['true_labels'] = None\n",
    "        individual_Segment_dict['predicted_labels'] = None\n",
    "        individual_Segment_dict['classification_df'] = None\n",
    "        individual_Segment_dict['fitness_score'] = (None, None)\n",
    "        individual_Segment_dict['number_of_syn_sample'] = None\n",
    "        individual_Segment_dict['retraining_time'] = None\n",
    "        individual_Segment_dict['retrained_dots_list'] = []\n",
    "        return individual_Segment_dict\n",
    "\n",
    "    # Reuse the results of an earlier segment that selected exactly the same dots.\n",
    "    reused_fields = ('model', 'true_labels', 'predicted_labels', 'classification_df',\n",
    "                     'fitness_score', 'number_of_syn_sample', 'retraining_time')\n",
    "    for previous_Segment_dict in history_segments_dict.values():\n",
    "        if syn_original_list == previous_Segment_dict['retrained_dots_list']:\n",
    "            for field in reused_fields:\n",
    "                individual_Segment_dict[field] = previous_Segment_dict[field]\n",
    "            individual_Segment_dict['retrained_dots_list'] = syn_original_list\n",
    "            return individual_Segment_dict\n",
    "\n",
    "    feature_columns = [\"text\", \"area_TEIS\"]\n",
    "    filtered_syn_df = data_syn[data_syn['index_meta'].isin(syn_original_list)]\n",
    "\n",
    "    X_train_re = pd.concat([X_train_r, filtered_syn_df.drop(columns=['topic_name'])])\n",
    "    Y_train_re = pd.concat([Y_train_r, filtered_syn_df['topic_name']])\n",
    "\n",
    "    train_pool_re = Pool(\n",
    "        X_train_re[feature_columns],\n",
    "        Y_train_re,\n",
    "        text_features=[\"text\"],\n",
    "        cat_features=[\"area_TEIS\"]\n",
    "    )\n",
    "    valid_pool_re = Pool(\n",
    "        X_test_re[feature_columns],\n",
    "        Y_test_re,\n",
    "        text_features=[\"text\"],\n",
    "        cat_features=[\"area_TEIS\"]\n",
    "    )\n",
    "\n",
    "    # Model training; the wall-clock fit time is charged against the GPU budget.\n",
    "    model_re = CatBoostClassifier(**catboost_params)\n",
    "    start_time = time.time()\n",
    "    model_re.fit(train_pool_re, eval_set=valid_pool_re)\n",
    "    training_time = time.time() - start_time\n",
    "\n",
    "    sum_GPU_seconds += training_time\n",
    "    if sum_GPU_seconds >= total_gpu_seconds:\n",
    "        GPU_limit = True\n",
    "\n",
    "    # Validation metrics: these define the segment's fitness.\n",
    "    predictions = model_re.predict(X_test_re[feature_columns])\n",
    "    accuracy = accuracy_score(Y_test_re, predictions)\n",
    "    report = classification_report(Y_test_re, predictions, digits=3, output_dict=True)\n",
    "    classification_df = classification_report_to_df(report, Y_test_re, predictions)\n",
    "\n",
    "    fitness_score = (accuracy, classification_df.loc[topic_name, 'recall'])\n",
    "\n",
    "    # Save the trained model, classification_df, and fitness_score.\n",
    "    # The two label fields are stored as empty lists, not the actual labels.\n",
    "    individual_Segment_dict['model'] = model_re\n",
    "    individual_Segment_dict['true_labels'] = []\n",
    "    individual_Segment_dict['predicted_labels'] = []\n",
    "    individual_Segment_dict['classification_df'] = classification_df\n",
    "    individual_Segment_dict['fitness_score'] = fitness_score\n",
    "    individual_Segment_dict['number_of_syn_sample'] = len(filtered_syn_df)\n",
    "    individual_Segment_dict['retraining_time'] = training_time\n",
    "    individual_Segment_dict['retrained_dots_list'] = syn_original_list\n",
    "\n",
    "    # Testing results below: score the same model on the held-out test split.\n",
    "    test_predictions = model_re.predict(X_test_re_Test[feature_columns])\n",
    "    test_report = classification_report(Y_test_re_Test, test_predictions, digits=3, output_dict=True)\n",
    "    test_classification_df = classification_report_to_df(test_report, Y_test_re_Test, test_predictions)\n",
    "\n",
    "    test_new_row_index = len(test_fold_results_df)\n",
    "    test_class_DF_path = f'{NSW_results_path}/test_Class_DF'\n",
    "    os.makedirs(test_class_DF_path, exist_ok=True)\n",
    "    test_class_DF_stem = f'{test_class_DF_path}/test_{topic_number}_NSW_{test_new_row_index}_AllEval_ClassDF'\n",
    "    test_classification_df.to_csv(f'{test_class_DF_stem}.csv', index=True)\n",
    "    test_classification_df.to_pickle(f'{test_class_DF_stem}.pkl')\n",
    "\n",
    "    # The T13 improvement columns are always tracked for this fixed topic,\n",
    "    # independent of the topic currently being retrained.\n",
    "    t13_topic = 'Humanitarian aid for Ukraine.'\n",
    "\n",
    "    # Collect all generation data into a new DataFrame row\n",
    "    test_new_ParamCV_row = {\n",
    "        \"topic_name\": topic_name,\n",
    "        \"topic_number\": topic_number,\n",
    "        \"PCA_index\": used_pca_pairs.index(individual_Segment_dict['PCA']),\n",
    "        \"PCA\": individual_Segment_dict['PCA'],\n",
    "        \"coordinates\": individual_Segment_dict['coordinates'],\n",
    "        # fitness_score / accuracy / topic_recall are the validation-split values.\n",
    "        'fitness_score': individual_Segment_dict['fitness_score'],\n",
    "        \"accuracy\": individual_Segment_dict['fitness_score'][0],\n",
    "        \"topic_recall\": individual_Segment_dict['fitness_score'][1],\n",
    "        'balanced_fitness_score': (test_classification_df.loc['accuracy', 'Balanced Accuracy'], test_classification_df.loc[topic_name, 'Balanced Accuracy']),\n",
    "        'overall_balanced_accuracy': test_classification_df.loc['accuracy', 'Balanced Accuracy'],\n",
    "        'topic_balanced_accuracy': test_classification_df.loc[topic_name, 'Balanced Accuracy'],\n",
    "        'balanced_acc_rec_score': (test_classification_df.loc[topic_name, 'Balanced Accuracy'], test_classification_df.loc[topic_name, 'recall']),\n",
    "        'topic_F1': test_classification_df.loc[topic_name, 'f1-score'],\n",
    "        'overall_F1': test_classification_df.loc['accuracy', 'f1-score'],\n",
    "        'overall_recall': test_classification_df.loc['accuracy', 'recall'],\n",
    "        \"retraining_time\": individual_Segment_dict[\"retraining_time\"],\n",
    "        \"number_of_syn_sample\": individual_Segment_dict['number_of_syn_sample'],\n",
    "        \"retrained_dots_list\": individual_Segment_dict['retrained_dots_list'],\n",
    "        'true_labels': individual_Segment_dict['true_labels'],\n",
    "        'predicted_labels': individual_Segment_dict['predicted_labels'],\n",
    "        \"segment_key\": (topic_name, topic_number, individual_Segment_dict['PCA'], individual_Segment_dict['coordinates']),\n",
    "        'classDF_path': f'{test_class_DF_stem}.csv',\n",
    "        'T13_TBA_Imp': test_classification_df.loc[t13_topic, 'Balanced Accuracy'] - bch_m0.loc[t13_topic, 'Balanced Accuracy'],\n",
    "        # Fixed: this used the validation report (classification_df) while the\n",
    "        # other report-derived fields of this row use the test report.\n",
    "        'T13_TR_Imp': test_classification_df.loc[t13_topic, 'recall'] - bch_m0.loc[t13_topic, 'recall']\n",
    "    }\n",
    "\n",
    "    # Append the row and persist the full results table after every evaluation.\n",
    "    test_new_ParamCV_row_df = pd.DataFrame([test_new_ParamCV_row])\n",
    "    test_fold_results_df = pd.concat([test_fold_results_df, test_new_ParamCV_row_df], ignore_index=True)\n",
    "    test_fold_results_df.to_csv(f'{NSW_results_path}/test_NSWAllSegs_{NSW_results_name}.csv', index=False)\n",
    "    test_fold_results_df.to_pickle(f'{NSW_results_path}/test_NSWAllSegs_{NSW_results_name}.pkl')\n",
    "\n",
    "    return individual_Segment_dict\n",
    "\n",
    "\n",
    "def sliding_window_get_dots(history_segments_dict, tr_df, data_syn, topic_name, topic_number, area_size_x, area_size_y, level_x_start, level_y_start, pca_pairs = [(\"PCA_0\", \"PCA_1\")], num_segments=16, window_size=1):\n",
    "    \"\"\"Slide a window over each PCA plane and retrain on every segment not yet evaluated.\n",
    "\n",
    "    For every `(pca1, pca2)` pair the plane is cut into `num_segments` steps per\n",
    "    axis and a window of `window_size` steps is moved across it. The rows of\n",
    "    `tr_df` falling inside the window (bounds inclusive) are split into\n",
    "    correctly predicted topic rows (\"blue\"), mispredicted topic rows (\"red\")\n",
    "    and rows of other topics (\"grey\"), then `segment_retraining` is called.\n",
    "    Segments that come back with a fitness score are added to\n",
    "    `history_segments_dict`, and the whole dict is pickled after each addition.\n",
    "    The scan returns early as soon as the global `GPU_limit` flag is set.\n",
    "\n",
    "    Args:\n",
    "        history_segments_dict: Dict keyed by\n",
    "            `(topic_name, topic_number, (pca1, pca2), coordinates)`; keys already\n",
    "            present are skipped. Mutated in place.\n",
    "        tr_df: DataFrame with `topic_name`, `pred_topic_name`, `index_meta` and\n",
    "            the PCA columns named in `pca_pairs`.\n",
    "        data_syn: Synthetic samples, forwarded to `segment_retraining`.\n",
    "        topic_name: Topic whose dots are collected.\n",
    "        topic_number: Topic identifier stored in the segment key.\n",
    "        area_size_x: \"FullSize\" to span the data range of the first PCA column,\n",
    "            otherwise a `(low, high)` pair.\n",
    "        area_size_y: Same as `area_size_x`, for the second PCA column.\n",
    "        level_x_start: \"minimum\" to start at the column minimum, otherwise the\n",
    "            numeric x origin of the grid.\n",
    "        level_y_start: Same as `level_x_start`, for the y axis.\n",
    "        pca_pairs: Iterable of `(pca1, pca2)` column-name pairs. The default\n",
    "            list is never mutated.\n",
    "        num_segments: Number of grid steps per axis.\n",
    "        window_size: Window edge length in grid steps.\n",
    "\n",
    "    Returns:\n",
    "        `history_segments_dict`, including any newly evaluated segments.\n",
    "    \"\"\"\n",
    "    for pca1, pca2 in pca_pairs:\n",
    "        if GPU_limit == True:\n",
    "            return history_segments_dict\n",
    "        # Separate data points by their classification status\n",
    "        grey_mask = tr_df['topic_name'] != topic_name\n",
    "        blue_mask = (tr_df['topic_name'] == topic_name) & (tr_df['pred_topic_name'] == topic_name)\n",
    "        red_mask = (tr_df['topic_name'] == topic_name) & (tr_df['pred_topic_name'] != topic_name)\n",
    "\n",
    "        if area_size_x == \"FullSize\":\n",
    "            segment_size_x = (tr_df[pca1].max() - tr_df[pca1].min()) / num_segments\n",
    "        else:\n",
    "            segment_size_x = (area_size_x[1] - area_size_x[0]) / num_segments\n",
    "        if area_size_y == \"FullSize\":\n",
    "            segment_size_y = (tr_df[pca2].max() - tr_df[pca2].min()) / num_segments\n",
    "        else:\n",
    "            segment_size_y = (area_size_y[1] - area_size_y[0]) / num_segments\n",
    "\n",
    "        # Grid origin and checkpoint location are fixed for the whole pair, so\n",
    "        # they are resolved once here instead of once per window position.\n",
    "        x_origin = tr_df[pca1].min() if level_x_start == \"minimum\" else level_x_start\n",
    "        y_origin = tr_df[pca2].min() if level_y_start == \"minimum\" else level_y_start\n",
    "        checkpoint_dir = f'{NSW_results_path}/{all_segments_dict_name}'\n",
    "        checkpoint_file = f'{checkpoint_dir}/{pca1}{pca2}.pkl'\n",
    "        window_positions = range(num_segments - window_size + 1)\n",
    "\n",
    "        # Sliding window through the plot area\n",
    "        for i in window_positions:\n",
    "            for j in window_positions:\n",
    "                # segment_retraining raises this flag once the GPU budget is spent.\n",
    "                if GPU_limit == True:\n",
    "                    return history_segments_dict\n",
    "\n",
    "                x_start = x_origin + i * segment_size_x\n",
    "                y_start = y_origin + j * segment_size_y\n",
    "                x_end = x_start + window_size * segment_size_x\n",
    "                y_end = y_start + window_size * segment_size_y\n",
    "\n",
    "                coordinates = f\"{x_start}-{x_end}, {y_start}-{y_end}\"\n",
    "                segment_key = (topic_name, topic_number, (pca1, pca2), coordinates)\n",
    "\n",
    "                # Skip segments that were already evaluated (dict membership\n",
    "                # test instead of a linear scan over all keys).\n",
    "                if segment_key in history_segments_dict:\n",
    "                    continue\n",
    "\n",
    "                segment_mask = (tr_df[pca1] >= x_start) & (tr_df[pca1] <= x_end) & (tr_df[pca2] >= y_start) & (tr_df[pca2] <= y_end)\n",
    "\n",
    "                individual_Segment_dict = {\n",
    "                    \"coordinates\": coordinates,\n",
    "                    \"PCA\": (pca1, pca2),\n",
    "                    \"blue_dots_list\": tr_df.loc[segment_mask & blue_mask, 'index_meta'].tolist(),\n",
    "                    \"red_dots_list\": tr_df.loc[segment_mask & red_mask, 'index_meta'].tolist(),\n",
    "                    \"grey_dots_list\": tr_df.loc[segment_mask & grey_mask, 'index_meta'].tolist(),\n",
    "                }\n",
    "\n",
    "                individual_Segment_dict = segment_retraining(data_syn, individual_Segment_dict, segment_key)\n",
    "\n",
    "                # Only segments that produced a fitness score are recorded.\n",
    "                if individual_Segment_dict['fitness_score'] != (None, None):\n",
    "                    history_segments_dict[segment_key] = individual_Segment_dict\n",
    "                    print(individual_Segment_dict['fitness_score'])\n",
    "\n",
    "                    # Checkpoint the whole history after every recorded segment.\n",
    "                    os.makedirs(checkpoint_dir, exist_ok=True)\n",
    "                    with open(checkpoint_file, 'wb') as file:\n",
    "                        pickle.dump(history_segments_dict, file)\n",
    "\n",
    "        print(f'Finished hierarchical sliding window for {topic_name}_{pca1}_{pca2}.')\n",
    "    return history_segments_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-25T22:45:11.725738Z",
     "iopub.status.busy": "2024-09-25T22:45:11.725738Z",
     "iopub.status.idle": "2024-09-25T22:45:11.753507Z",
     "shell.execute_reply": "2024-09-25T22:45:11.752496Z"
    }
   },
   "outputs": [],
   "source": [
    "def dominates(score1, score2):\n",
    "    \"\"\"\n",
    "    Tell whether score1 Pareto-dominates score2 for two objectives that are maximised.\n",
    "\n",
    "    score1 dominates when it is at least as good on both objectives and\n",
    "    strictly better on at least one of them. Both arguments are indexed at\n",
    "    positions 0 and 1.\n",
    "    \"\"\"\n",
    "    wins_on_first = score1[0] > score2[0] and score1[1] >= score2[1]\n",
    "    return wins_on_first or (score1[0] >= score2[0] and score1[1] > score2[1])\n",
    "\n",
    "def find_pareto_front(df):\n",
    "    \"\"\"\n",
    "    Return a copy of df with a 'Pareto' column: 'Yes' for rows whose\n",
    "    'balanced_acc_rec_score' is not dominated by any other row, 'No' otherwise.\n",
    "    \"\"\"\n",
    "    marked = df.copy()  # work on a copy so the caller's frame stays untouched\n",
    "    marked['Pareto'] = 'No'\n",
    "\n",
    "    score_list = marked['balanced_acc_rec_score'].tolist()\n",
    "    on_front = np.ones(len(score_list), dtype=bool)\n",
    "\n",
    "    # A row leaves the front as soon as one other row dominates it.\n",
    "    for candidate_idx, candidate in enumerate(score_list):\n",
    "        beaten = any(\n",
    "            dominates(rival, candidate)\n",
    "            for rival_idx, rival in enumerate(score_list)\n",
    "            if rival_idx != candidate_idx\n",
    "        )\n",
    "        if beaten:\n",
    "            on_front[candidate_idx] = False\n",
    "\n",
    "    marked.loc[on_front, 'Pareto'] = 'Yes'\n",
    "    return marked\n",
    "\n",
    "def find_best_values(df):\n",
    "    \"\"\"\n",
    "    Add a 'best' column listing, per row, the metric columns in which that row\n",
    "    reaches the column-wide maximum. df is modified in place and returned.\n",
    "    \"\"\"\n",
    "    tracked_columns = (\n",
    "        'accuracy',\n",
    "        'topic_recall',\n",
    "        'overall_balanced_accuracy',\n",
    "        'topic_balanced_accuracy',\n",
    "        'topic_F1',\n",
    "        'overall_F1',\n",
    "        'overall_recall',\n",
    "    )\n",
    "    # Column-wide maxima, computed once before the row scan.\n",
    "    column_maxima = {name: df[name].max() for name in tracked_columns}\n",
    "\n",
    "    def columns_at_maximum(record):\n",
    "        winners = []\n",
    "        for name, top_value in column_maxima.items():\n",
    "            if record[name] == top_value:\n",
    "                winners.append(name)\n",
    "        return winners\n",
    "\n",
    "    df['best'] = df.apply(columns_at_maximum, axis=1)\n",
    "\n",
    "    return df\n",
    "\n",
    "def post_process(df, bch_class_df):\n",
    "    \"\"\"\n",
    "    Add improvement-over-benchmark columns and their running summaries to df.\n",
    "\n",
    "    The topic is looked up from the module-level `topic_dict` using the global\n",
    "    `topic_number`. For four metrics the benchmark value from `bch_class_df`\n",
    "    is subtracted to give an `imp_*` column; then the cumulative retraining\n",
    "    time and, per metric, a running maximum and a running mean of the\n",
    "    improvement are appended. df is modified in place and returned.\n",
    "    \"\"\"\n",
    "    global topic_number\n",
    "    current_topic = topic_dict[topic_number]\n",
    "\n",
    "    # (improvement column, source column, benchmark value)\n",
    "    improvement_spec = (\n",
    "        ('imp_topic_recall', 'topic_recall', bch_class_df.loc[current_topic, 'recall']),\n",
    "        ('imp_topic_balanced_accuracy', 'topic_balanced_accuracy', bch_class_df.loc[current_topic, 'Balanced Accuracy']),\n",
    "        ('imp_overall_balanced_accuracy', 'overall_balanced_accuracy', bch_class_df.loc['accuracy', 'Balanced Accuracy']),\n",
    "        ('imp_overall_F1', 'overall_F1', bch_class_df.loc['accuracy', 'f1-score']),\n",
    "    )\n",
    "    for improvement_col, source_col, benchmark_value in improvement_spec:\n",
    "        df[improvement_col] = df[source_col] - benchmark_value\n",
    "\n",
    "    # Running total of the time spent retraining, in row order.\n",
    "    df['cumulative_time'] = df['retraining_time'].cumsum()\n",
    "\n",
    "    # (improvement column, running-max column, running-mean column)\n",
    "    summary_spec = (\n",
    "        ('imp_topic_recall', 'max_topic_recall_imp', 'average_topic_recall_imp'),\n",
    "        ('imp_topic_balanced_accuracy', 'max_topic_balanced_acc_imp', 'average_topic_balanced_acc_imp'),\n",
    "        ('imp_overall_balanced_accuracy', 'max_overall_balanced_acc_imp', 'average_overall_balanced_acc_imp'),\n",
    "        ('imp_overall_F1', 'max_overall_F1_improvement', 'average_overall_F1_improvement'),\n",
    "    )\n",
    "    for improvement_col, running_max_col, running_mean_col in summary_spec:\n",
    "        single_metric = df[[improvement_col]]\n",
    "        df[running_max_col] = single_metric.max(axis=1).cummax()\n",
    "        df[running_mean_col] = single_metric.mean(axis=1).expanding().mean()\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-25T22:45:11.756471Z",
     "iopub.status.busy": "2024-09-25T22:45:11.756471Z",
     "iopub.status.idle": "2024-09-25T22:45:11.786260Z",
     "shell.execute_reply": "2024-09-25T22:45:11.784684Z"
    }
   },
   "outputs": [],
   "source": [
    "def bch_classification_report_to_df(report, y_true, y_pred):\n",
    "    \"\"\"Turn a classification-report dict into a DataFrame with extra metric columns.\n",
    "\n",
    "    Adds `Sensitivity` (a copy of `recall`), `Specificity` (from the confusion\n",
    "    matrix) and `Balanced Accuracy` (their mean) for every row.\n",
    "\n",
    "    Args:\n",
    "        report: Report dict whose last three entries are the summary rows\n",
    "            ('accuracy', 'macro avg', 'weighted avg'); all entries before them\n",
    "            are treated as class labels.\n",
    "        y_true: True labels, passed to `confusion_matrix`.\n",
    "        y_pred: Predicted labels, passed to `confusion_matrix`.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame indexed by class label plus the three summary rows.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If the report has no 'accuracy' row.\n",
    "    \"\"\"\n",
    "    df = pd.DataFrame(report).transpose()\n",
    "    # The [:-3] slicing below relies on the three summary rows being present.\n",
    "    if 'accuracy' not in df.index:\n",
    "        raise KeyError('accuracy')\n",
    "\n",
    "    class_rows = df.index[:-3]  # Exclude 'accuracy', 'macro avg', 'weighted avg'\n",
    "    cm = confusion_matrix(y_true, y_pred, labels=class_rows)\n",
    "\n",
    "    # Per-class TP, FP, FN, TN from the confusion matrix\n",
    "    TP = cm.diagonal()\n",
    "    FP = cm.sum(axis=0) - TP\n",
    "    FN = cm.sum(axis=1) - TP\n",
    "    TN = cm.sum() - (FP + FN + TP)\n",
    "\n",
    "    # Sensitivity / specificity pooled over all classes\n",
    "    sens = sum(TP) / (sum(TP) + sum(FN))\n",
    "    spec = sum(TN) / (sum(TN) + sum(FP))\n",
    "\n",
    "    # Sensitivity is the same quantity as recall\n",
    "    df['Sensitivity'] = df['recall']\n",
    "\n",
    "    # Per-class specificity; the three summary rows are filled in below\n",
    "    df.loc[class_rows, 'Specificity'] = TN / (TN + FP)\n",
    "\n",
    "    # The 'accuracy' row receives the pooled sensitivity and specificity\n",
    "    # (not the accuracy value itself).\n",
    "    df.loc['accuracy', ['Sensitivity', 'Specificity']] = sens, spec\n",
    "\n",
    "    # 'macro avg' and 'weighted avg' of the per-class values\n",
    "    df.loc['macro avg', 'Sensitivity'] = df.iloc[:-3]['Sensitivity'].mean()\n",
    "    df.loc['weighted avg', 'Sensitivity'] = np.average(df.iloc[:-3]['Sensitivity'], weights=df.iloc[:-3]['support'])\n",
    "\n",
    "    df.loc['macro avg', 'Specificity'] = df.iloc[:-3]['Specificity'].mean()\n",
    "    df.loc['weighted avg', 'Specificity'] = np.average(df.iloc[:-3]['Specificity'], weights=df.iloc[:-3]['support'])\n",
    "\n",
    "    # Balanced Accuracy for each row, including the summary rows\n",
    "    df['Balanced Accuracy'] = (df['Sensitivity'] + df['Specificity']) / 2\n",
    "\n",
    "    return df\n",
    "\n",
    "def train_bch(X_train_re, X_test_re, Y_train_re, Y_test_re, catboost_params, itr0_path):\n",
    "    \"\"\"\n",
    "    Train the benchmark CatBoost model and save its validation and test reports.\n",
    "\n",
    "    The model is fitted on the training split with the validation split\n",
    "    (`X_test_re` / `Y_test_re`) as eval set. It is then scored on the validation\n",
    "    split and on the global test split (`X_test_re_Test` / `Y_test_re_Test`);\n",
    "    each accuracy is printed and each report is written to `itr0_path` as\n",
    "    pickle and CSV.\n",
    "\n",
    "    Returns a dict with the fitted `model`, the test-split `classification_df`,\n",
    "    the test-split `accuracy` and the `retraining_time` in seconds.\n",
    "    \"\"\"\n",
    "    feature_columns = [\"text\", \"area_TEIS\"]\n",
    "\n",
    "    def build_pool(features, labels):\n",
    "        # Same text / categorical feature layout for every pool.\n",
    "        return Pool(\n",
    "            features[feature_columns],\n",
    "            labels,\n",
    "            text_features=[\"text\"],\n",
    "            cat_features=[\"area_TEIS\"]\n",
    "        )\n",
    "\n",
    "    def score_split(features, labels, pickle_path, csv_path):\n",
    "        # Predict, print the accuracy, then build and persist the report.\n",
    "        split_predictions = model_re.predict(features[feature_columns])\n",
    "        split_accuracy = accuracy_score(labels, split_predictions)\n",
    "        split_report = classification_report(labels, split_predictions, digits=3, output_dict=True)\n",
    "        print(split_accuracy)\n",
    "        split_df = bch_classification_report_to_df(split_report, labels, split_predictions)\n",
    "        split_df.to_pickle(pickle_path)\n",
    "        split_df.to_csv(csv_path, index=True)\n",
    "        return split_accuracy, split_df\n",
    "\n",
    "    CPU_monitor_memory_usage()\n",
    "    monitor_gpu_memory()\n",
    "\n",
    "    train_pool_re = build_pool(X_train_re, Y_train_re)\n",
    "    valid_pool_re = build_pool(X_test_re, Y_test_re)\n",
    "\n",
    "    # Model Training, timed with wall-clock seconds\n",
    "    model_re = CatBoostClassifier(**catboost_params)\n",
    "    fit_started = time.time()\n",
    "    model_re.fit(train_pool_re, eval_set=valid_pool_re)\n",
    "    training_time = time.time() - fit_started\n",
    "\n",
    "    # Validation split first, then the held-out test split.\n",
    "    score_split(\n",
    "        X_test_re,\n",
    "        Y_test_re,\n",
    "        f\"{itr0_path}/Validation_Benchmark_M0_Classdf_0.pkl\",\n",
    "        f\"{itr0_path}/Validation_Benchmark_M0_Classdf_0.csv\",\n",
    "    )\n",
    "    accuracy, classification_df = score_split(\n",
    "        X_test_re_Test,\n",
    "        Y_test_re_Test,\n",
    "        f\"{itr0_path}/Benchmark_M0_Classdf_0.pkl\",\n",
    "        f\"{itr0_path}/Benchmark_M0_Classdf_0.csv\",\n",
    "    )\n",
    "\n",
    "    bch_dict = {\n",
    "        'model': model_re,\n",
    "        'classification_df': classification_df,\n",
    "        'accuracy': accuracy,\n",
    "        'retraining_time': training_time,\n",
    "    }\n",
    "\n",
    "    return bch_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-25T22:45:11.790258Z",
     "iopub.status.busy": "2024-09-25T22:45:11.789225Z",
     "iopub.status.idle": "2024-09-26T00:10:44.879508Z",
     "shell.execute_reply": "2024-09-26T00:10:44.875530Z"
    }
   },
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    run = 1\n",
    "    rand = 10\n",
    "    \n",
    "    run_path = f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}\"\n",
    "    itr0_path = f\"{run_path}/Iteration_0\"\n",
    "    os.makedirs(itr0_path, exist_ok=True)\n",
    "\n",
    "    # Load Data\n",
    "    data = pd.read_csv(f'D:/AutoGeTS/Data/tickets_topics.csv',lineterminator='\\n')\n",
    "    data_topic = data.dropna().reset_index()\n",
    "    data_topic = data_topic.rename(columns={'index': 'index_meta'})\n",
    "\n",
    "    X_train_r_both, X_test_re_Test, Y_train_r_both, Y_test_re_Test = train_test_split(data_topic, data_topic.topic_name, test_size = 0.2,random_state = 42)\n",
    "        \n",
    "    # Further split the training set to create a validation set\n",
    "    X_train_r, X_test_re, Y_train_r, Y_test_re = train_test_split(\n",
    "        X_train_r_both, \n",
    "        Y_train_r_both, \n",
    "        test_size=0.2,  # 20% of the initial training set, which is 16% of the original data\n",
    "        random_state=rand\n",
    "    )\n",
    "\n",
    "    catboost_params = {'iterations': 300, 'learning_rate': 0.2, 'depth': 8, 'l2_leaf_reg': 1, \n",
    "                        'bagging_temperature': 1, 'random_strength': 1, 'border_count': 254, \n",
    "                        'eval_metric': 'TotalF1', 'task_type': 'GPU', 'early_stopping_rounds': 20, 'use_best_model': True, 'verbose': 0, 'random_seed': rand}\n",
    "\n",
    "    topic_dict = {\"T1\": \"IT support and assistance.\",\"T2\": \"Account activation and access issues.\",\"T3\": \"Password and device security.\",\n",
    "                    \"T4\": \"Printer issues and troubleshooting.\",\"T5\": \"HP Dock connectivity issues.\",\"T6\": \"Employee documentation and errors.\",\n",
    "                    \"T7\": \"\\\"Access and login issues\\\"\",\"T8\": \"Opening and managing files/devices.\",\"T9\": \"Mobile email and VPN setup.\",\n",
    "                    \"T10\": \"IT support and communication.\",\"T11\": \"Error handling in RPG programming.\",\"T12\": \"Email security and attachments.\",\n",
    "                    \"T13\": \"Humanitarian aid for Ukraine.\",\"T14\": \"Internet connectivity issues in offices.\",\"T15\": \"Improving integration with Infojobs.\"}\n",
    "\n",
    "    bch_dict = train_bch(X_train_r, X_test_re, Y_train_r, Y_test_re, catboost_params, itr0_path)\n",
    "    for iteration in [1]:\n",
    "        if iteration == 1:\n",
    "            topic_number = \"T5\"\n",
    "        # for topic_number in [\"T10\"]: # \"T3\", \"T6\", \"T7\", \"T8\", \"T9\", \"T12\"\n",
    "        # iteration = 3\n",
    "        prev_itr = iteration - 1\n",
    "\n",
    "        gpu_hours = 1\n",
    "\n",
    "        NSW_results_path = f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{iteration}/{topic_number}_GPU{gpu_hours}h_SSW\"\n",
    "        os.makedirs(NSW_results_path, exist_ok=True)\n",
    "        \n",
    "        \"\"\"Results region\"\"\"\n",
    "        fold_results_df = pd.DataFrame() # pd.read_pickle(\"D:/AutoGeTS/Topic_Experiments/Naive-SW_Results/NSWAllSegs_NSW_T13_ModeBoth_as=T,o,p,i,c,M,i,n,M,a,x_ns=1,6_ws=4_PCA_0PCA_1_PCA_18PCA_19.pkl\") # pd.DataFrame() # pd.read_pickle(\"D:/AutoGeTS/Topic_Experiments/Hierarchical-SW_Results/HSWAllSegs_HSW_T13_ModeBoth_as=T,o,p,i,c,M,i,n,M,a,x_ns=8,4,2_ws=h,a,l,f_PCA_0PCA_1_PCA_18PCA_19.pkl\") # pd.DataFrame()  # DataFrame to collect aggregated results\n",
    "        fold_pfs_df = pd.DataFrame()  # DataFrame to collect details from each fold\n",
    "        pca_pfs_df = pd.DataFrame() # pd.read_pickle(\"D:/AutoGeTS/Topic_Experiments/Naive-SW_Results/NSWpcaPFs_NSW_T13_ModeBoth_as=T,o,p,i,c,M,i,n,M,a,x_ns=1,6_ws=4_PCA_0PCA_1_PCA_18PCA_19.pkl\") # pd.DataFrame() \n",
    "        test_fold_results_df = pd.DataFrame()\n",
    "\n",
    "        history_segments_dict_allPCA = {}\n",
    "        history_pareto_segments_list = []\n",
    "        PCA_stats_df_allPCA = pd.DataFrame(columns=[\"PCA\", \"Number of Evaluations\", \"Best Fitness\", \"Worst Fitness\", \"Mean Overall Accuracy\", \"Mean Topic Recall\", \"Pareto Front Segments\"])\n",
    "\n",
    "        if iteration > 1:\n",
    "            bch_class_df = pd.read_pickle(f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{prev_itr}/Bch_Itr_{prev_itr}.pkl\")\n",
    "            bch_filtered_columns = [col for col in bch_class_df.columns if not col.startswith(\"Diff\") and col != \"Accuracy\"]\n",
    "            bch_class_df = bch_class_df[bch_filtered_columns]\n",
    "        else:\n",
    "            bch_class_df = pd.read_pickle(f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_0/Benchmark_M0_Classdf_0.pkl\")\n",
    "\n",
    "        bch_m0 = pd.read_pickle(f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_0/Benchmark_M0_Classdf_0.pkl\")\n",
    "        \n",
    "        if iteration > 1:\n",
    "            prev_itr_X_train_re = pd.read_pickle(f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{prev_itr}/X_train_re_itr_{prev_itr}.pkl\")\n",
    "            X_train_r = prev_itr_X_train_re\n",
    "            prev_itr_Y_train_re = pd.read_pickle(f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{prev_itr}/Y_train_re_itr_{prev_itr}.pkl\")\n",
    "            Y_train_r = prev_itr_Y_train_re\n",
    "\n",
    "        index_meta_values = X_train_r['index_meta'].unique()\n",
    "        train_PCA_YZ_df = pd.read_pickle(\"D:/AutoGeTS/Data/Train_PCA_YZ_withPred_0.pkl\")\n",
    "        train_PCA_YZ_df = train_PCA_YZ_df[train_PCA_YZ_df['index_meta'].isin(index_meta_values)]\n",
    "        # train_PCA_YZ_df = train_PCA_YZ_df.rename(columns={'index': 'index_meta'})\n",
    "        pca_columns = [col for col in train_PCA_YZ_df.columns if 'PCA_' in col]\n",
    "        pca_pairs = list(itertools.combinations(pca_columns, 2))\n",
    "\n",
    "        topic_dict = {\"T1\": \"IT support and assistance.\",\"T2\": \"Account activation and access issues.\",\"T3\": \"Password and device security.\",\n",
    "                    \"T4\": \"Printer issues and troubleshooting.\",\"T5\": \"HP Dock connectivity issues.\",\"T6\": \"Employee documentation and errors.\",\n",
    "                    \"T7\": \"\\\"Access and login issues\\\"\",\"T8\": \"Opening and managing files/devices.\",\"T9\": \"Mobile email and VPN setup.\",\n",
    "                    \"T10\": \"IT support and communication.\",\"T11\": \"Error handling in RPG programming.\",\"T12\": \"Email security and attachments.\",\n",
    "                    \"T13\": \"Humanitarian aid for Ukraine.\",\"T14\": \"Internet connectivity issues in offices.\",\"T15\": \"Improving integration with Infojobs.\"}\n",
    "\n",
    "        \"\"\"Parameters and Input Section ----------------\"\"\"\n",
    "        # topic_number = \"T13\"\n",
    "        syn_number = 1\n",
    "\n",
    "        dots_mode = \"Both\"\n",
    "\n",
    "        total_gpu_seconds = gpu_hours * 60 * 60\n",
    "        \n",
    "        # Added synthetic data path\n",
    "        if topic_number in [\"T1\", \"T2\"]:\n",
    "            data_syn_raw = pd.read_pickle(f'D:/AutoGeTS/Synthetic_Data/{topic_number}-synthesis-{syn_number}.pkl')\n",
    "        else:\n",
    "            data_syn_raw = pd.read_csv(f'D:/AutoGeTS/Synthetic_Data/{topic_number}-synthesis-{syn_number}.csv',lineterminator='\\n')\n",
    "        data_syn = data_syn_raw[[\"index_meta\", \"text\", \"area_TEIS\", 'topic_name', \"sample\"]].dropna()\n",
    "    \n",
    "        used_pca_pairs = pca_pairs # pca_pairs, [(\"PCA_0\", \"PCA_5\")]\n",
    "\n",
    "        NSW_params = {'area_size': \"TopicMinMax\",\n",
    "                    'num_segments': '16',\n",
    "                    'window_size': '4'\n",
    "                    }\n",
    "\n",
    "        NSW_results_name = f\"NSW_{topic_number}_Mode{dots_mode}_{dict_to_foldername(NSW_params)}_{''.join(item for item in used_pca_pairs[0])}_{''.join(item for item in used_pca_pairs[-1])}\"\n",
    "\n",
    "        \"\"\"------------------------------------------------\"\"\"\n",
    "        topic_name = topic_dict[topic_number]\n",
    "        clean_topic_name = clean_folder_name(topic_name)\n",
    "\n",
    "        sum_GPU_seconds = 0\n",
    "        GPU_limit = False\n",
    "\n",
    "        for pca_i, pca_pair in enumerate(used_pca_pairs):\n",
    "            # if pca_i <= 47:\n",
    "            #     continue\n",
    "            print(f\"Pair {pca_i}\", pca_pair)\n",
    "            CPU_monitor_memory_usage()\n",
    "            monitor_gpu_memory()\n",
    "\n",
    "            current_used_pca_pairs = [pca_pair]\n",
    "\n",
    "            if NSW_params['area_size'] == \"FullSize\":\n",
    "                area_size_x = \"FullSize\"\n",
    "                area_size_y = \"FullSize\"\n",
    "                x_start = \"minimum\"\n",
    "                y_start = \"minimum\"\n",
    "            elif NSW_params['area_size'] == \"TopicMinMax\":\n",
    "                filtered_df = train_PCA_YZ_df[train_PCA_YZ_df['topic_name'] == topic_name]\n",
    "                area_size_x =(filtered_df[pca_pair[0]].min()*1.01, filtered_df[pca_pair[0]].max()*1.01)\n",
    "                area_size_y =(filtered_df[pca_pair[1]].min()*1.01, filtered_df[pca_pair[1]].max()*1.01)\n",
    "                x_start = area_size_x[0] \n",
    "                y_start = area_size_y[0]\n",
    "            else:\n",
    "                area_size_x = NSW_params['area_size']\n",
    "                area_size_y = NSW_params['area_size']\n",
    "                x_start = area_size_x[0]\n",
    "                y_start = area_size_y[0]\n",
    "            num_segments = int(NSW_params['num_segments'])\n",
    "            window_size = int(NSW_params['window_size'])\n",
    "\n",
    "            history_segments_dict = {}\n",
    "            PCA_stats_df = pd.DataFrame(columns=[\"PCA\", \"Number of Evaluations\", \"Best Fitness\", \"Worst Fitness\", \"Mean Overall Accuracy\", \"Mean Topic Recall\", \"Pareto Front Segments\"])\n",
    "            all_segments_dict_name = f\"NSW-Retrain-Dict_{topic_number}_Mode{dots_mode}_{''.join(item for item in used_pca_pairs[0])}_NS{num_segments}_WS{window_size}\"\n",
    "            PCA_stats_df_name = f\"NSW-PCA-Stats_{topic_number}_Mode{dots_mode}_{''.join(item for item in used_pca_pairs[0])}_NS{num_segments}_WS{window_size}\"\n",
    "\n",
    "            all_segments_dict = sliding_window_get_dots(history_segments_dict, train_PCA_YZ_df, data_syn, topic_name, topic_number, area_size_x, area_size_y, x_start, y_start, pca_pairs = current_used_pca_pairs, num_segments=num_segments, window_size=window_size)\n",
    "            history_segments_dict_allPCA.update(all_segments_dict)\n",
    "            # with open(f'{NSW_results_path}/{all_segments_dict_name}_AllPCADictsList.pkl', 'wb') as file:\n",
    "            #     pickle.dump(history_segments_dict_allPCA, file)\n",
    "            \n",
    "            observer_function(all_segments_dict, pca_pair)\n",
    "            PCA_stats_df_allPCA = pd.concat([PCA_stats_df_allPCA, PCA_stats_df], axis=0, ignore_index=True)\n",
    "            PCA_stats_df_allPCA.to_csv(f'{NSW_results_path}/{PCA_stats_df_name}.csv', index=False)\n",
    "            PCA_stats_df_allPCA.to_pickle(f'{NSW_results_path}/{PCA_stats_df_name}.pkl')\n",
    "\n",
    "            pca_pareto_observer(history_pareto_segments_list, pca_pair)\n",
    "            if GPU_limit == True:\n",
    "                fold_results_df  = find_pareto_front(fold_results_df)\n",
    "                fold_results_df = find_best_values(fold_results_df)\n",
    "                fold_results_df = post_process(fold_results_df, bch_class_df)\n",
    "                fold_results_df.to_csv(f'{NSW_results_path}/NSWAllSegs_{NSW_results_name}.csv', index=False)\n",
    "                fold_results_df.to_pickle(f'{NSW_results_path}/NSWAllSegs_{NSW_results_name}.pkl')\n",
    "                # break\n",
    "                test_fold_results_df  = find_pareto_front(test_fold_results_df)\n",
    "                test_fold_results_df = find_best_values(test_fold_results_df)\n",
    "                test_fold_results_df = post_process(test_fold_results_df, bch_class_df)\n",
    "                test_fold_results_df.to_csv(f'{NSW_results_path}/test_NSWAllSegs_{NSW_results_name}.csv', index=True)\n",
    "                test_fold_results_df.to_pickle(f'{NSW_results_path}/test_NSWAllSegs_{NSW_results_name}.pkl')\n",
    "                break\n",
    "        \n",
    "        \"\"\"Extract best model and append synthetics\"\"\"\n",
    "        # Find the index of the row with the largest value in the 'max_overall_balanced_acc_imp' column\n",
    "        index_of_max_imp = test_fold_results_df['imp_overall_balanced_accuracy'].idxmax()\n",
    "        print(index_of_max_imp)\n",
    "\n",
    "        # Retrieve the row corresponding to this index\n",
    "        row_with_largest_value = test_fold_results_df.loc[index_of_max_imp]\n",
    "\n",
    "        filtered_syn_df = data_syn[data_syn['index_meta'].isin(row_with_largest_value['retrained_dots_list'])]\n",
    "\n",
    "        X_train_re = pd.concat([X_train_r, filtered_syn_df.drop(columns=['topic_name'])])\n",
    "        Y_train_re = pd.concat([Y_train_r, filtered_syn_df['topic_name']])\n",
    "\n",
    "        train_pool_re = Pool(\n",
    "            X_train_re[[\"text\", \"area_TEIS\"]],\n",
    "            Y_train_re,\n",
    "            text_features=[\"text\"],\n",
    "            cat_features=[\"area_TEIS\"]\n",
    "        )\n",
    "        valid_pool_re = Pool(\n",
    "            X_test_re[[\"text\", \"area_TEIS\"]],\n",
    "            Y_test_re,\n",
    "            text_features=[\"text\"],\n",
    "            cat_features=[\"area_TEIS\"]\n",
    "        )\n",
    "\n",
    "        catboost_params = catboost_params\n",
    "                    \n",
    "        # Model Training\n",
    "        model_re = CatBoostClassifier(**catboost_params)\n",
    "        # start_time = time.time()  # Start timing\n",
    "        model_re.fit(train_pool_re, eval_set=valid_pool_re)\n",
    "        # training_time = time.time() - start_time  # End timing\n",
    "\n",
    "        # Save the retrain performances\n",
    "        predictions = model_re.predict(X_test_re_Test[[\"text\", \"area_TEIS\"]])\n",
    "        accuracy = accuracy_score(Y_test_re_Test, predictions)\n",
    "        report = classification_report(Y_test_re_Test, predictions, digits=6, output_dict=True)\n",
    "        classification_df = classification_report_to_df(report, Y_test_re_Test, predictions)\n",
    "\n",
    "        print(classification_df)\n",
    "\n",
    "        iteration_repo = f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{iteration}/\"\n",
    "        os.makedirs(iteration_repo, exist_ok=True)\n",
    "        if classification_df.loc[topic_dict[\"T13\"], 'Diff Balanced Accuracy'] >= 0:\n",
    "            classification_df.to_csv(f\"{iteration_repo}/Bch_Itr_{iteration}.csv\", index=True)\n",
    "            classification_df.to_pickle(f\"{iteration_repo}/Bch_Itr_{iteration}.pkl\")\n",
    "\n",
    "            X_train_re.to_pickle(f\"{iteration_repo}/X_train_re_itr_{iteration}.pkl\")\n",
    "            Y_train_re.to_pickle(f\"{iteration_repo}/Y_train_re_itr_{iteration}.pkl\")\n",
    "        else:\n",
    "            iteration_noimprove_repo = f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{iteration}/Itr_No_Improve\"\n",
    "            os.makedirs(iteration_noimprove_repo, exist_ok=True)\n",
    "            classification_df.to_csv(f\"{iteration_noimprove_repo}/Bch_Itr_{iteration}.csv\", index=True)\n",
    "            classification_df.to_pickle(f\"{iteration_noimprove_repo}/Bch_Itr_{iteration}.pkl\")\n",
    "\n",
    "            X_train_re.to_pickle(f\"{iteration_noimprove_repo}/X_train_re_itr_{iteration}.pkl\")\n",
    "            Y_train_re.to_pickle(f\"{iteration_noimprove_repo}/Y_train_re_itr_{iteration}.pkl\")\n",
    "\n",
    "            bch_class_df.to_csv(f\"{iteration_repo}/Bch_Itr_{iteration}.csv\", index=True)\n",
    "            bch_class_df.to_pickle(f\"{iteration_repo}/Bch_Itr_{iteration}.pkl\")\n",
    "\n",
    "            prev_itr_X_train_re.to_pickle(f\"{iteration_repo}/X_train_re_itr_{iteration}.pkl\")\n",
    "            prev_itr_Y_train_re.to_pickle(f\"{iteration_repo}/Y_train_re_itr_{iteration}.pkl\")\n",
    "        # PCA_stats_df_allPCA = pd.concat([PCA_stats_df_allPCA, PCA_stats_df], axis=0, ignore_index=True)\n",
    "        # PCA_stats_df_allPCA.to_csv(f'{NSW_results_path}/{PCA_stats_df_name}.csv', index=False)\n",
    "        # PCA_stats_df_allPCA.to_pickle(f'{NSW_results_path}/{PCA_stats_df_name}.pkl')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
