{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-26T01:18:22.296287Z",
     "iopub.status.busy": "2024-09-26T01:18:22.295288Z",
     "iopub.status.idle": "2024-09-26T01:18:23.986076Z",
     "shell.execute_reply": "2024-09-26T01:18:23.985038Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import itertools\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "\n",
    "from catboost import CatBoostClassifier, Pool\n",
    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import time\n",
    "import pickle\n",
    "import psutil\n",
    "import gc\n",
    "import random\n",
    "import pynvml\n",
    "\n",
    "random.seed(42)\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-26T01:18:23.992073Z",
     "iopub.status.busy": "2024-09-26T01:18:23.991072Z",
     "iopub.status.idle": "2024-09-26T01:18:24.016190Z",
     "shell.execute_reply": "2024-09-26T01:18:24.016190Z"
    }
   },
   "outputs": [],
   "source": [
    "# Function to clean folder names\n",
    "def clean_folder_name(folder_name):\n",
    "    # Remove invalid characters\n",
    "    cleaned_name = re.sub(r'[<>:\"/\\\\|?*]', '', folder_name)\n",
    "    # Remove trailing dots and spaces\n",
    "    cleaned_name = cleaned_name.rstrip('. ')\n",
    "    return cleaned_name\n",
    "\n",
    "def CPU_monitor_memory_usage():\n",
    "    memory_info = psutil.virtual_memory()\n",
    "    memory_usage = memory_info.percent\n",
    "        \n",
    "    print(f\"CPU Current memory usage: {memory_usage}%\")\n",
    "\n",
    "    if memory_usage >= 95:\n",
    "        print(\"CPU Memory usage is too high. Pausing execution...\")\n",
    "        gc.collect()  # Trigger garbage collection manually\n",
    "        while memory_usage > 30:\n",
    "            time.sleep(10)\n",
    "            memory_info = psutil.virtual_memory()\n",
    "            memory_usage = memory_info.percent\n",
    "        print(\"CPU Memory usage is low enough. Resuming execution...\")\n",
    "\n",
    "    # time.sleep(5)\n",
    "\n",
    "def monitor_gpu_memory():\n",
    "    # Initialize NVML\n",
    "    pynvml.nvmlInit()\n",
    "    \n",
    "    try:\n",
    "        # Get handle for the first GPU\n",
    "        handle = pynvml.nvmlDeviceGetHandleByIndex(0)\n",
    "\n",
    "        # Get memory info\n",
    "        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)\n",
    "        total_memory = mem_info.total\n",
    "        used_memory = mem_info.used\n",
    "\n",
    "        # Calculate the percentage of GPU memory used\n",
    "        memory_usage = (used_memory / total_memory) * 100\n",
    "        print(f\"Current GPU memory usage: {memory_usage:.2f}%\")\n",
    "\n",
    "        # Check if memory usage is too high\n",
    "        if memory_usage >= 95:\n",
    "            print(\"GPU memory usage is too high. Pausing execution...\")\n",
    "            while memory_usage > 30:\n",
    "                time.sleep(10)\n",
    "                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)\n",
    "                used_memory = mem_info.used\n",
    "                memory_usage = (used_memory / total_memory) * 100\n",
    "            print(\"GPU memory usage is low enough. Resuming execution...\")\n",
    "\n",
    "    finally:\n",
    "        # Clean up\n",
    "        pynvml.nvmlShutdown()\n",
    "\n",
    "\n",
    "def dict_to_foldername(param_dict, max_values=2):\n",
    "    # Flatten the dictionary into a string of 'key=abbreviated_values' pairs\n",
    "    parts = []\n",
    "    for key, values in param_dict.items():\n",
    "        # Abbreviate key names\n",
    "        short_key = ''.join(word[0] for word in key.split('_'))\n",
    "        \n",
    "        # Convert list of values to a comma-separated string\n",
    "        value_str = ','.join(map(str, values))\n",
    "        # Form key=value string and append to the parts list\n",
    "        parts.append(f\"{short_key}={value_str}\")\n",
    "\n",
    "    # Join all parts with a separator and prepend prefix\n",
    "    folder_name = \"_\".join(parts)\n",
    "\n",
    "    # Replace any potentially problematic characters (if any)\n",
    "    folder_name = folder_name.replace(\":\", \"-\").replace(\"/\", \"-\").replace(\"\\\\\", \"-\")\n",
    "\n",
    "    return folder_name\n",
    "\n",
    "def make_hashable(dict_obj):\n",
    "    return tuple(sorted(dict_obj.items()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-26T01:18:24.021192Z",
     "iopub.status.busy": "2024-09-26T01:18:24.020191Z",
     "iopub.status.idle": "2024-09-26T01:18:24.108790Z",
     "shell.execute_reply": "2024-09-26T01:18:24.108790Z"
    }
   },
   "outputs": [],
   "source": [
    "def extract_coordinates(coord):\n",
    "    # Split into x and y components\n",
    "    x, y = coord.split(', ')\n",
    "\n",
    "    # Custom function to split by hyphen, ignoring '-' after 'e'\n",
    "    def split_by_hyphen(s):\n",
    "        parts = []\n",
    "        i = 0\n",
    "        hp_index = []\n",
    "        while i < len(s):\n",
    "            if s[i] == '-' and (i == 0 or s[i-1] != 'e'):\n",
    "                hp_index.append(i)\n",
    "            i += 1\n",
    "        if 0 in hp_index:\n",
    "            parts.append(s[:hp_index[0]])\n",
    "            parts.append(s[hp_index[0]+1:hp_index[1]])\n",
    "            parts.append(s[hp_index[1]+1:])\n",
    "            return float(parts[1]) * -1, float(parts[2])\n",
    "        else:\n",
    "            parts.append(s[:hp_index[0]])\n",
    "            parts.append(s[hp_index[0]+1:])\n",
    "            return float(parts[0]), float(parts[1])\n",
    "\n",
    "    # Apply the split function to both x and y components\n",
    "    x_start, x_end = split_by_hyphen(x)\n",
    "    y_start, y_end = split_by_hyphen(y)\n",
    "\n",
    "    return x_start, x_end, y_start, y_end\n",
    "\n",
    "\n",
    "def classification_report_to_df(report, y_true, y_pred):\n",
    "    global bch_class_df\n",
    "    global topic_dict\n",
    "    global iteration\n",
    "    df = pd.DataFrame(report).transpose()\n",
    "\n",
    "    order_labels = list(topic_dict.values())\n",
    "\n",
    "    # Calculate the confusion matrix\n",
    "    labels = df.index[:-3]  # Exclude 'accuracy', 'macro avg', 'weighted avg'\n",
    "    # Calculate the confusion matrix\n",
    "    cm = confusion_matrix(y_true, y_pred, labels=labels)\n",
    "\n",
    "    # Extracting TP, FP, TN, FN for each class\n",
    "    TP = cm.diagonal()\n",
    "    FP = cm.sum(axis=0) - TP\n",
    "    FN = cm.sum(axis=1) - TP\n",
    "    TN = cm.sum() - (FP + FN + TP)\n",
    "\n",
    "    sens = sum(TP) / (sum(TP)+sum(FN))\n",
    "    spec = sum(TN) / (sum(TN)+sum(FP))\n",
    "    \n",
    "    # Calculate Sensitivity (same as recall)\n",
    "    df['Sensitivity'] = df['recall']\n",
    "    \n",
    "    # Calculate Specificity\n",
    "    tn = cm.sum() - (cm.sum(axis=0) + cm.sum(axis=1) - np.diag(cm))\n",
    "    fp = cm.sum(axis=0) - np.diag(cm)\n",
    "    specificity = tn / (tn + fp)\n",
    "    \n",
    "    # Assign computed specificity to dataframe except for the last three rows\n",
    "    df.loc[df.index[:-3], 'Specificity'] = specificity\n",
    "    \n",
    "    # Handling special cases\n",
    "    # Set 'accuracy' row sensitivity and specificity to the accuracy value\n",
    "    accuracy = df.loc['accuracy', 'precision']  # assuming 'precision' contains the accuracy\n",
    "    df.loc['accuracy', ['Sensitivity', 'Specificity']] = sens, spec\n",
    "    \n",
    "    # Calculate 'macro avg' and 'weighted avg' for sensitivity and specificity\n",
    "    df.loc['macro avg', 'Sensitivity'] = df.iloc[:-3]['Sensitivity'].mean()\n",
    "    df.loc['weighted avg', 'Sensitivity'] = np.average(df.iloc[:-3]['Sensitivity'], weights=df.iloc[:-3]['support'])\n",
    "    \n",
    "    df.loc['macro avg', 'Specificity'] = df.iloc[:-3]['Specificity'].mean()\n",
    "    df.loc['weighted avg', 'Specificity'] = np.average(df.iloc[:-3]['Specificity'], weights=df.iloc[:-3]['support'])\n",
    "\n",
    "    # Calculate Balanced Accuracy for each row, including special averages\n",
    "    df['Balanced Accuracy'] = (df['Sensitivity'] + df['Specificity']) / 2\n",
    "\n",
    "    df.loc['accuracy', 'precision'] = sum(TP) / (sum(TP) + sum(FP))\n",
    "    df.loc['accuracy', 'recall'] = sum(TP) / (sum(TP) + sum(FN))\n",
    "    df.loc['accuracy', 'f1-score'] = 2* sum(TP) / (2 * sum(TP) + sum(FP) + sum(FN))\n",
    "\n",
    "    if iteration > 1:\n",
    "        bch_class_df_noFr = bch_class_df.drop(columns=['TP', 'FP', 'TN', 'FN'])\n",
    "    else: \n",
    "        bch_class_df_noFr = bch_class_df\n",
    "\n",
    "    diff_df = df - bch_class_df_noFr\n",
    "    # Renaming columns for clarity\n",
    "    diff_df.columns = ['Diff ' + col for col in diff_df.columns]\n",
    "\n",
    "    # Concatenating the original dataframe with the differences\n",
    "    combined_df = pd.concat([df, diff_df], axis=1)\n",
    "\n",
    "    class_accuracy = cm.diagonal() / cm.sum(axis=1)\n",
    "    combined_df.loc[labels, 'Accuracy'] = class_accuracy\n",
    "    # Copying f1-score to 'Accuracy' for the last three rows\n",
    "    combined_df.loc[['accuracy', 'macro avg', 'weighted avg'], 'Accuracy'] = combined_df.loc[['accuracy', 'macro avg', 'weighted avg'], 'f1-score']\n",
    "\n",
    "    # Calculate and append TP, FP, TN, FN metrics\n",
    "    metrics_df = pd.DataFrame({\n",
    "        \"TP\": TP,\n",
    "        \"FP\": FP,\n",
    "        \"TN\": TN,\n",
    "        \"FN\": FN\n",
    "    }, index=labels)\n",
    "\n",
    "    # Merge the new metrics into the existing DataFrame\n",
    "    combined_df = combined_df.merge(metrics_df, left_index=True, right_index=True, how='left')\n",
    "\n",
    "    # Reorder DataFrame based on specified order labels\n",
    "    combined_df = combined_df.reindex(order_labels + ['accuracy', 'macro avg', 'weighted avg'])\n",
    "\n",
    "    return combined_df\n",
    "\n",
    "\n",
    "def dominates(score1, score2):\n",
    "    return (score1[0] > score2[0] and score1[1] >= score2[1]) or (score1[0] >= score2[0] and score1[1] > score2[1])\n",
    "\n",
    "def extract_non_dominated_segments(segments, level, prev_key):\n",
    "    global metric_name\n",
    "    global topic_name\n",
    "\n",
    "    # Check if all fitness scores are (None, None)\n",
    "    if all(seg['fitness_score'] == (None, None) for seg in segments.values()):\n",
    "        print(f'No fitness scores for {prev_key} in level {level}')\n",
    "        return {}  # Return an empty dictionary to terminate further processing\n",
    "\n",
    "    non_dominated_segments = {}\n",
    "    best_value = float('-inf')  # Initialize with the lowest possible value\n",
    "    \n",
    "    # Iterate through each segment to find those with the best metric value\n",
    "    for key, segment in segments.items():\n",
    "        classification_df = segment['classification_df']\n",
    "        if metric_name == \"overall_balanced_accuracy\":\n",
    "            metric_value = classification_df.loc['accuracy', 'Balanced Accuracy']\n",
    "        elif metric_name == \"overall_f1-score\":\n",
    "            metric_value = classification_df.loc['accuracy', 'f1-score']\n",
    "        else:\n",
    "            metric_value = classification_df.loc[topic_name, metric_name]\n",
    "\n",
    "        # Skip segments where the metric value is None\n",
    "        if metric_value is None:\n",
    "            continue\n",
    "\n",
    "        # Check if the current segment's metric value is higher than the current best\n",
    "        if metric_value > best_value:\n",
    "            best_value = metric_value\n",
    "            non_dominated_segments = {key: segment}  # Start a new dictionary for the new best value\n",
    "        elif metric_value == best_value:\n",
    "            non_dominated_segments[key] = segment  # Add the current segment to the dictionary as it matches the best value\n",
    "\n",
    "    observer_function(segments, non_dominated_segments, level, prev_key)\n",
    "    return non_dominated_segments\n",
    "\n",
    "\n",
    "def get_current_level_segments(history_dict, level):\n",
    "    grouped_segments = {}\n",
    "    for key, value in history_dict.items():\n",
    "        if key[-2] == level-1:\n",
    "            prev_key = key[-1]\n",
    "            if prev_key not in grouped_segments:\n",
    "                grouped_segments[prev_key] = {}\n",
    "            grouped_segments[prev_key][key] = value\n",
    "    \n",
    "    all_non_dominated_segments = {}\n",
    "    for prev_key, segments in grouped_segments.items():\n",
    "        non_dominated_segments = extract_non_dominated_segments(segments, level, prev_key)\n",
    "        if non_dominated_segments:\n",
    "            all_non_dominated_segments[prev_key] = non_dominated_segments\n",
    "\n",
    "    return all_non_dominated_segments\n",
    "\n",
    "\n",
    "def observer_function(segments, non_dominated_segments, level, prev_key):\n",
    "    global topic_name\n",
    "    global topic_number\n",
    "    global HSW_results_path\n",
    "    global level_stats_df_name\n",
    "    global level_stats_df\n",
    "    global history_pareto_segments_list\n",
    "    global HSW_results_name\n",
    "    global fold_results_df\n",
    "    global pca_pairs_used\n",
    "\n",
    "    for i, (key, seg) in enumerate(segments.items()):\n",
    "        classification_df = seg['classification_df']\n",
    "\n",
    "        new_row_index = len(fold_results_df)\n",
    "        class_DF_path = f'{HSW_results_path}/Class_DF'\n",
    "        os.makedirs(class_DF_path, exist_ok=True)\n",
    "        classification_df.to_csv(f'{class_DF_path}/{topic_number}_HSW_{new_row_index}_AllEval_ClassDF.csv', index=True)\n",
    "        classification_df.to_pickle(f'{class_DF_path}/{topic_number}_HSW_{new_row_index}_AllEval_ClassDF.pkl')\n",
    "\n",
    "        # Collect all generation data into a new DataFrame row\n",
    "        new_ParamCV_row = {\n",
    "            \"topic_name\": topic_name,\n",
    "            \"topic_number\": topic_number,\n",
    "            \"PCA_index\": pca_pairs_used.index(seg['PCA']),\n",
    "            \"PCA\": seg['PCA'],\n",
    "            \"coordinates\": seg['coordinates'],\n",
    "            'fitness_score': seg['fitness_score'],\n",
    "            \"accuracy\": seg['fitness_score'][0],\n",
    "            \"topic_recall\": seg['fitness_score'][1],\n",
    "            'balanced_fitness_score': (classification_df.loc['accuracy', 'Balanced Accuracy'], classification_df.loc[topic_name, 'Balanced Accuracy']),\n",
    "            'overall_balanced_accuracy': classification_df.loc['accuracy', 'Balanced Accuracy'],\n",
    "            'topic_balanced_accuracy': classification_df.loc[topic_name, 'Balanced Accuracy'],\n",
    "            'balanced_acc_rec_score': (classification_df.loc[topic_name, 'Balanced Accuracy'], classification_df.loc[topic_name, 'recall']),\n",
    "            'topic_F1': classification_df.loc[topic_name, 'f1-score'],\n",
    "            'overall_F1': classification_df.loc['accuracy', 'f1-score'],\n",
    "            'overall_recall': classification_df.loc['accuracy', 'recall'],\n",
    "            \"retraining_time\": seg[\"retraining_time\"],\n",
    "            \"number_of_syn_sample\": seg['number_of_syn_sample'],\n",
    "            \"retrained_dots_list\": seg['retrained_dots_list'],\n",
    "            'true_labels': seg['true_labels'],\n",
    "            'predicted_labels': seg['predicted_labels'],\n",
    "            \"segment_key\": key,\n",
    "            'classDF_path': f'{class_DF_path}/{topic_number}_HSW_{new_row_index}_AllEval_ClassDF.csv'\n",
    "        }\n",
    "        # # Transform DataFrame to dict format\n",
    "        # for idx in classification_df.index:\n",
    "        #     if idx != 'accuracy' and idx != 'macro avg' and idx != 'weighted avg':\n",
    "        #         new_ParamCV_row[idx] = classification_df.loc[idx].dropna().to_dict()\n",
    "        # Convert the dictionary to a DataFrame for a single row\n",
    "        new_ParamCV_row_df = pd.DataFrame([new_ParamCV_row])\n",
    "        # Concatenate this new row DataFrame to the existing DataFrame\n",
    "        fold_results_df = pd.concat([fold_results_df, new_ParamCV_row_df], ignore_index=True)\n",
    "        fold_results_df.to_csv(f'{HSW_results_path}/HSWAllSegs_{HSW_results_name}.csv', index=False)\n",
    "        fold_results_df.to_pickle(f'{HSW_results_path}/HSWAllSegs_{HSW_results_name}.pkl')\n",
    "    \n",
    "    objective_1_values = [seg['fitness_score'][0] for seg in segments.values() if seg['fitness_score'][0] is not None]\n",
    "    objective_2_values = [seg['fitness_score'][1] for seg in segments.values() if seg['fitness_score'][1] is not None]\n",
    "\n",
    "    # Check if both lists are empty or filled with zeros\n",
    "    if not objective_1_values or not objective_2_values or all(value == 0 for value in objective_1_values + objective_2_values):\n",
    "        print(\"All objectives are None or 0, skipping further processes.\")\n",
    "        return  # Exit the function\n",
    "\n",
    "    pareto_fitness_tuples = [(ft[0], ft[1]) for ft in [seg['fitness_score'] for seg in non_dominated_segments.values()] if ft[0] is not None and ft[1] is not None]\n",
    "    pareto_segments_tuples = [(seg['coordinates'], seg['number_of_syn_sample'], seg['retrained_dots_list']) for seg in non_dominated_segments.values()]\n",
    "\n",
    "    prev_key_plot = prev_key[:-1]\n",
    "    # Calculate statistics\n",
    "    worst_fitness = (min(objective_1_values), min(objective_2_values))\n",
    "    best_fitness = (max(objective_1_values), max(objective_2_values))\n",
    "    mean_OverallAcc = np.mean(objective_1_values)\n",
    "    mean_ClassRecall = np.mean(objective_2_values)\n",
    "\n",
    "    print(f\"Level: {level-1}, Previous Key: {prev_key_plot}, Evaluations: {len(segments)}\")\n",
    "    print(f\"Best Fitness: {best_fitness}\")\n",
    "    print(f\"Worst Fitness: {worst_fitness}\")\n",
    "    print(f\"Mean Overall Accuracy: {mean_OverallAcc}\")\n",
    "    print(f\"Mean {topic_name} Recall: {mean_ClassRecall}\")\n",
    "\n",
    "    print(\"Pareto Front Selections:---------------------\")\n",
    "    for i, (key, seg) in enumerate(non_dominated_segments.items()):\n",
    "        print(f\"Segment {i+1}: {seg['coordinates']}, Num Samples {seg['number_of_syn_sample']}: {seg['retrained_dots_list']} \\nFitness: {seg['fitness_score']}\")\n",
    "        history_pareto_segments_list.append(key)\n",
    "        print('---')\n",
    "    print('------------------------')\n",
    "\n",
    "    # Plotting\n",
    "    plt.figure(figsize=(12, 7))\n",
    "    plt.scatter(objective_1_values, objective_2_values, c='blue', alpha=0.5, label='Population')\n",
    "    if pareto_fitness_tuples:\n",
    "        plt.scatter([ft[0] for ft in pareto_fitness_tuples], [ft[1] for ft in pareto_fitness_tuples], c='red', alpha=0.9, label='Pareto Front')\n",
    "    \n",
    "    plt.xlabel('Objective 1: Overall Accuracy')\n",
    "    plt.ylabel(f'Objective 2: {topic_name} Recall')\n",
    "    plt.title(f'{topic_number}, Population and Pareto Front at Level {level-1}, Previous Key: {prev_key_plot}')\n",
    "    plt.legend()\n",
    "    pareto_plots_dir = f\"{HSW_results_path}/{level_stats_df_name}\"\n",
    "    os.makedirs(pareto_plots_dir, exist_ok=True)\n",
    "    filename = f\"{prev_key_plot[2][0]}{prev_key_plot[2][1]}_Level_{level-1}_{prev_key_plot}.png\"\n",
    "    # plt.savefig(os.path.join(pareto_plots_dir, filename), dpi=200, bbox_inches='tight', pad_inches=0)\n",
    "    plt.show()\n",
    "\n",
    "    recall_key = f\"Mean {topic_name} Recall\"\n",
    "    # Collect all generation data into a new DataFrame row\n",
    "    new_row = {\n",
    "        \"PCA\": f\"{prev_key_plot[2][0]}{prev_key_plot[2][1]}\",\n",
    "        \"Level\": level-1,\n",
    "        \"Previous Key\": prev_key_plot,\n",
    "        \"Number of Evaluations\": len(segments),\n",
    "        \"Best Fitness\": best_fitness,\n",
    "        \"Worst Fitness\": worst_fitness,\n",
    "        \"Mean Overall Accuracy\": mean_OverallAcc,\n",
    "        recall_key: mean_ClassRecall,\n",
    "        \"Pareto Front Segments\": [pareto_segments_tuples, pareto_fitness_tuples]\n",
    "    }\n",
    "    # Convert the dictionary to a DataFrame for a single row\n",
    "    new_row_df = pd.DataFrame([new_row])\n",
    "    # Concatenate this new row DataFrame to the existing DataFrame\n",
    "    level_stats_df = pd.concat([level_stats_df, new_row_df], ignore_index=True)\n",
    "    level_stats_df.to_csv(f'{HSW_results_path}/{level_stats_df_name}/{prev_key_plot[2][0]}{prev_key_plot[2][1]}.csv', index=False)\n",
    "    level_stats_df.to_pickle(f'{HSW_results_path}/{level_stats_df_name}/{prev_key_plot[2][0]}{prev_key_plot[2][1]}.pkl')\n",
    "\n",
    "def dominates(score1, score2):\n",
    "    return (score1[0] > score2[0] and score1[1] >= score2[1]) or (score1[0] >= score2[0] and score1[1] > score2[1])\n",
    "\n",
    "def final_pareto_observer(history_pareto_segments_list):\n",
    "    global history_segments_dict_allPCA\n",
    "    global topic_name\n",
    "    global topic_number\n",
    "    global HSW_results_path\n",
    "    global level_stats_df_name\n",
    "    global level_stats_df\n",
    "    global fold_pfs_df\n",
    "    global HSW_results_name\n",
    "    global pca_pairs_used\n",
    "\n",
    "    objective_1_values = [history_segments_dict_allPCA[seg_key]['fitness_score'][0] for seg_key in history_pareto_segments_list if history_segments_dict_allPCA[seg_key]['fitness_score'][0] is not None]\n",
    "    objective_2_values = [history_segments_dict_allPCA[seg_key]['fitness_score'][1] for seg_key in history_pareto_segments_list if history_segments_dict_allPCA[seg_key]['fitness_score'][1] is not None]\n",
    "\n",
    "    # Check if both lists are empty or filled with zeros\n",
    "    if not objective_1_values or not objective_2_values or all(value == 0 for value in objective_1_values + objective_2_values):\n",
    "        print(\"All objectives are None or 0, skipping further processes.\")\n",
    "        return  # Exit the function\n",
    "\n",
    "    non_dominated_segments = {}\n",
    "    for i1, key1 in enumerate(history_pareto_segments_list):\n",
    "        if history_segments_dict_allPCA[key1]['fitness_score'] == (None, None):\n",
    "            continue  # Skip non-evaluable segments\n",
    "\n",
    "        dominated = False\n",
    "        for i2, key2 in enumerate(history_pareto_segments_list):\n",
    "            if i1 != i2 and dominates(history_segments_dict_allPCA[key2]['fitness_score'], history_segments_dict_allPCA[key1]['fitness_score']):\n",
    "                dominated = True\n",
    "                break\n",
    "        if not dominated:\n",
    "            non_dominated_segments[key1] = history_segments_dict_allPCA[key1]\n",
    "\n",
    "    pareto_fitness_tuples = [(ft[0], ft[1]) for ft in [history_segments_dict_allPCA[seg_key]['fitness_score'] for seg_key in non_dominated_segments.keys()] if ft[0] is not None and ft[1] is not None]\n",
    "    pareto_segments_tuples = [(seg_key[2], seg_key[3], history_segments_dict_allPCA[seg_key]['number_of_syn_sample'], history_segments_dict_allPCA[seg_key]['retrained_dots_list']) for seg_key in non_dominated_segments.keys()]\n",
    "\n",
    "    # Calculate statistics\n",
    "    worst_fitness = (min(objective_1_values), min(objective_2_values))\n",
    "    best_fitness = (max(objective_1_values), max(objective_2_values))\n",
    "    mean_OverallAcc = np.mean(objective_1_values)\n",
    "    mean_ClassRecall = np.mean(objective_2_values)\n",
    "\n",
    "    print(f\"All Pareto Segments in History, Evaluations: {len(history_pareto_segments_list)}\")\n",
    "    print(f\"Best Fitness: {best_fitness}\")\n",
    "    print(f\"Worst Fitness: {worst_fitness}\")\n",
    "    print(f\"Mean Overall Accuracy: {mean_OverallAcc}\")\n",
    "    print(f\"Mean {topic_name} Recall: {mean_ClassRecall}\")\n",
    "\n",
    "    print(\"Pareto Front Selections:---------------------\")\n",
    "    for i, (key, seg) in enumerate(non_dominated_segments.items()):\n",
    "        print(f\"Segment {i+1}: {seg['coordinates']}, {seg['PCA']}, Num Samples {seg['number_of_syn_sample']}: {seg['retrained_dots_list']} \\nFitness: {seg['fitness_score']}\")\n",
    "\n",
    "        classification_df = seg['classification_df']\n",
    "        # Collect all generation data into a new DataFrame row\n",
    "        new_PFs_row = {\n",
    "            \"topic_name\": topic_name,\n",
    "            \"topic_number\": topic_number,\n",
    "            \"PCA_index\": pca_pairs_used.index(seg['PCA']),\n",
    "            \"PCA\": seg['PCA'],\n",
    "            \"coordinates\": seg['coordinates'],\n",
    "            'fitness_score': seg['fitness_score'],\n",
    "            \"accuracy\": seg['fitness_score'][0],\n",
    "            \"topic_recall\": seg['fitness_score'][1],\n",
    "            \"retraining_time\": seg[\"retraining_time\"],\n",
    "            \"number_of_syn_sample\": seg['number_of_syn_sample'],\n",
    "            \"retrained_dots_list\": seg['retrained_dots_list'],\n",
    "            'true_labels': seg['true_labels'],\n",
    "            'predicted_labels': seg['predicted_labels'],\n",
    "            \"segment_key\": key\n",
    "        }\n",
    "        # # Transform DataFrame to dict format\n",
    "        # for idx in classification_df.index:\n",
    "        #     if idx != 'accuracy' and idx != 'macro avg' and idx != 'weighted avg':\n",
    "        #         new_PFs_row[idx] = classification_df.loc[idx].dropna().to_dict()\n",
    "        # Convert the dictionary to a DataFrame for a single row\n",
    "        new_PFs_row_df = pd.DataFrame([new_PFs_row])\n",
    "        # Concatenate this new row DataFrame to the existing DataFrame\n",
    "        fold_pfs_df = pd.concat([fold_pfs_df, new_PFs_row_df], ignore_index=True)\n",
    "        fold_pfs_df.to_csv(f'{HSW_results_path}/HSWPFs_{HSW_results_name}.csv', index=False)\n",
    "        fold_pfs_df.to_pickle(f'{HSW_results_path}/HSWPFs_{HSW_results_name}.pkl')\n",
    "        print('---')\n",
    "    print('------------------------')\n",
    "\n",
    "    # Plotting\n",
    "    plt.figure(figsize=(12, 7))\n",
    "    plt.scatter(objective_1_values, objective_2_values, c='blue', alpha=0.5, label='Population')\n",
    "    if pareto_fitness_tuples:\n",
    "        plt.scatter([ft[0] for ft in pareto_fitness_tuples], [ft[1] for ft in pareto_fitness_tuples], c='red', alpha=0.9, label='Pareto Front')\n",
    "    \n",
    "    plt.xlabel('Objective 1: Overall Accuracy')\n",
    "    plt.ylabel(f'Objective 2: {topic_name} Recall')\n",
    "    plt.title(f'{topic_number}, Population and Pareto Front for All Pareto Segments in History')\n",
    "    plt.legend()\n",
    "    pareto_plots_dir = f\"{HSW_results_path}/{level_stats_df_name}\"\n",
    "    os.makedirs(pareto_plots_dir, exist_ok=True)\n",
    "    filename = f\"All Pareto Segments in History.png\"\n",
    "    # plt.savefig(os.path.join(pareto_plots_dir, filename), dpi=200, bbox_inches='tight', pad_inches=0)\n",
    "    plt.show()\n",
    "\n",
    "    recall_key = f\"Mean {topic_name} Recall\"\n",
    "    # Collect all generation data into a new DataFrame row\n",
    "    new_row = {\n",
    "        \"PCA\": f\"All PCAs\",\n",
    "        \"Level\": \"All Levels\",\n",
    "        \"Previous Key\": \"No Prev Key\",\n",
    "        \"Number of Evaluations\": len(history_pareto_segments_list),\n",
    "        \"Best Fitness\": best_fitness,\n",
    "        \"Worst Fitness\": worst_fitness,\n",
    "        \"Mean Overall Accuracy\": mean_OverallAcc,\n",
    "        recall_key: mean_ClassRecall,\n",
    "        \"Pareto Front Segments\": [pareto_segments_tuples, pareto_fitness_tuples]\n",
    "    }\n",
    "    # Convert the dictionary to a DataFrame for a single row\n",
    "    new_row_df = pd.DataFrame([new_row])\n",
    "    # Concatenate this new row DataFrame to the existing DataFrame\n",
    "    level_stats_df = pd.concat([level_stats_df, new_row_df], ignore_index=True)\n",
    "    level_stats_df.to_csv(f'{HSW_results_path}/{level_stats_df_name}/All Pareto Segments in History.csv', index=False)\n",
    "    level_stats_df.to_pickle(f'{HSW_results_path}/{level_stats_df_name}/All Pareto Segments in History.pkl')\n",
    "\n",
    "\n",
    "def pca_pareto_observer(history_pareto_segments_list, pca_pair):\n",
    "    global history_segments_dict_allPCA\n",
    "    global topic_name\n",
    "    global topic_number\n",
    "    global HSW_results_path\n",
    "    global HSW_results_name\n",
    "    global level_stats_df_name\n",
    "    global pca_pfs_df\n",
    "    global pca_pairs_used\n",
    "\n",
    "    # Filter the list to include only tuples where the (pca1, pca2) matches the input pca_pair\n",
    "    filtered_list = [item for item in history_pareto_segments_list if item[2] == pca_pair]\n",
    "\n",
    "    objective_1_values = [history_segments_dict_allPCA[seg_key]['fitness_score'][0] for seg_key in filtered_list if history_segments_dict_allPCA[seg_key]['fitness_score'][0] is not None]\n",
    "    objective_2_values = [history_segments_dict_allPCA[seg_key]['fitness_score'][1] for seg_key in filtered_list if history_segments_dict_allPCA[seg_key]['fitness_score'][1] is not None]\n",
    "\n",
    "    # Check if both lists are empty or filled with zeros\n",
    "    if not objective_1_values or not objective_2_values or all(value == 0 for value in objective_1_values + objective_2_values):\n",
    "        print(\"All objectives are None or 0, skipping further processes.\")\n",
    "        return  # Exit the function\n",
    "\n",
    "    non_dominated_segments = {}\n",
    "    for i1, key1 in enumerate(filtered_list):\n",
    "        if history_segments_dict_allPCA[key1]['fitness_score'] == (None, None):\n",
    "            continue  # Skip non-evaluable segments\n",
    "\n",
    "        dominated = False\n",
    "        for i2, key2 in enumerate(filtered_list):\n",
    "            if i1 != i2 and dominates(history_segments_dict_allPCA[key2]['fitness_score'], history_segments_dict_allPCA[key1]['fitness_score']):\n",
    "                dominated = True\n",
    "                break\n",
    "        if not dominated:\n",
    "            non_dominated_segments[key1] = history_segments_dict_allPCA[key1]\n",
    "\n",
    "    pareto_fitness_tuples = [(ft[0], ft[1]) for ft in [history_segments_dict_allPCA[seg_key]['fitness_score'] for seg_key in non_dominated_segments.keys()] if ft[0] is not None and ft[1] is not None]\n",
    "    # pareto_segments_tuples = [(seg_key[2], seg_key[3], history_segments_dict_allPCA[seg_key]['number_of_syn_sample'], history_segments_dict_allPCA[seg_key]['retrained_dots_list']) for seg_key in non_dominated_segments.keys()]\n",
    "\n",
    "    # Calculate statistics\n",
    "    worst_fitness = (min(objective_1_values), min(objective_2_values))\n",
    "    best_fitness = (max(objective_1_values), max(objective_2_values))\n",
    "    mean_OverallAcc = np.mean(objective_1_values)\n",
    "    mean_ClassRecall = np.mean(objective_2_values)\n",
    "\n",
    "    print(f\"All Pareto Segments in {pca_pair}, Evaluations: {len(filtered_list)}\")\n",
    "    print(f\"Best Fitness: {best_fitness}\")\n",
    "    print(f\"Worst Fitness: {worst_fitness}\")\n",
    "    print(f\"Mean Overall Accuracy: {mean_OverallAcc}\")\n",
    "    print(f\"Mean {topic_name} Recall: {mean_ClassRecall}\")\n",
    "\n",
    "    print(\"Pareto Front Selections:---------------------\")\n",
    "    for i, (key, seg) in enumerate(non_dominated_segments.items()):\n",
    "        print(f\"Segment {i+1}: {seg['coordinates']}, {seg['PCA']}, Num Samples {seg['number_of_syn_sample']}: {seg['retrained_dots_list']} \\nFitness: {seg['fitness_score']}\")\n",
    "\n",
    "        classification_df = seg['classification_df']\n",
    "        # Collect all generation data into a new DataFrame row\n",
    "        pca_PFs_row = {\n",
    "            \"topic_name\": topic_name,\n",
    "            \"topic_number\": topic_number,\n",
    "            \"PCA_index\": pca_pairs_used.index(seg['PCA']),\n",
    "            \"PCA\": seg['PCA'],\n",
    "            \"coordinates\": seg['coordinates'],\n",
    "            'fitness_score': seg['fitness_score'],\n",
    "            \"accuracy\": seg['fitness_score'][0],\n",
    "            \"topic_recall\": seg['fitness_score'][1],\n",
    "            'balanced_fitness_score': (classification_df.loc['accuracy', 'Balanced Accuracy'], classification_df.loc[topic_name, 'Balanced Accuracy']),\n",
    "            'overall_balanced_accuracy': classification_df.loc['accuracy', 'Balanced Accuracy'],\n",
    "            'topic_balanced_accuracy': classification_df.loc[topic_name, 'Balanced Accuracy'],\n",
    "            'balanced_acc_rec_score': (classification_df.loc[topic_name, 'Balanced Accuracy'], classification_df.loc[topic_name, 'recall']),\n",
    "            'topic_F1': classification_df.loc[topic_name, 'f1-score'],\n",
    "            'overall_F1': classification_df.loc['accuracy', 'f1-score'],\n",
    "            'overall_recall': classification_df.loc['accuracy', 'recall'],\n",
    "            \"retraining_time\": seg[\"retraining_time\"],\n",
    "            \"number_of_syn_sample\": seg['number_of_syn_sample'],\n",
    "            \"retrained_dots_list\": seg['retrained_dots_list'],\n",
    "            'true_labels': seg['true_labels'],\n",
    "            'predicted_labels': seg['predicted_labels'],\n",
    "            \"segment_key\": key\n",
    "        }\n",
    "        # # Transform DataFrame to dict format\n",
    "        # for idx in classification_df.index:\n",
    "        #     if idx != 'accuracy' and idx != 'macro avg' and idx != 'weighted avg':\n",
    "        #         pca_PFs_row[idx] = classification_df.loc[idx].dropna().to_dict()\n",
    "        # Convert the dictionary to a DataFrame for a single row\n",
    "        pca_PFs_row_df = pd.DataFrame([pca_PFs_row])\n",
    "        # Concatenate this new row DataFrame to the existing DataFrame\n",
    "        pca_pfs_df = pd.concat([pca_pfs_df, pca_PFs_row_df], ignore_index=True)\n",
    "        pca_pfs_df.to_csv(f'{HSW_results_path}/HSWpcaPFs_{HSW_results_name}.csv', index=False)\n",
    "        pca_pfs_df.to_pickle(f'{HSW_results_path}/HSWpcaPFs_{HSW_results_name}.pkl')\n",
    "        print('---')\n",
    "    print('------------------------')\n",
    "\n",
    "    # Plotting\n",
    "    plt.figure(figsize=(12, 7))\n",
    "    plt.scatter(objective_1_values, objective_2_values, c='blue', alpha=0.5, label='Population')\n",
    "    if pareto_fitness_tuples:\n",
    "        plt.scatter([ft[0] for ft in pareto_fitness_tuples], [ft[1] for ft in pareto_fitness_tuples], c='red', alpha=0.9, label='Pareto Front')\n",
    "    \n",
    "    plt.xlabel('Objective 1: Overall Accuracy')\n",
    "    plt.ylabel(f'Objective 2: {topic_name} Recall')\n",
    "    plt.title(f'{topic_number}, Population and Pareto Front for All Pareto Segments in {pca_pair}')\n",
    "    plt.legend()\n",
    "    pareto_plots_dir = f\"{HSW_results_path}/{level_stats_df_name}\"\n",
    "    os.makedirs(pareto_plots_dir, exist_ok=True)\n",
    "    filename = f\"All Pareto Segments in {pca_pair}.png\"\n",
    "    # plt.savefig(os.path.join(pareto_plots_dir, filename), dpi=200, bbox_inches='tight', pad_inches=0)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-26T01:18:24.114793Z",
     "iopub.status.busy": "2024-09-26T01:18:24.114793Z",
     "iopub.status.idle": "2024-09-26T01:18:24.171179Z",
     "shell.execute_reply": "2024-09-26T01:18:24.170639Z"
    }
   },
   "outputs": [],
   "source": [
    "def segment_retraining(data_syn, individual_Segment_dict, segment_key):\n",
    "    \"\"\"\n",
    "    Retrain the classifier with the synthetic samples selected by one segment and score it.\n",
    "\n",
    "    Synthetic rows of `data_syn` whose 'index_meta' is in the segment's dot list are appended\n",
    "    to the global training data (X_train_r, Y_train_r). A CatBoostClassifier built from the\n",
    "    global `catboost_params` is fitted with the global validation data (X_test_re, Y_test_re)\n",
    "    as eval set, scored on that validation data, and additionally evaluated on the global\n",
    "    test data (X_test_re_Test, Y_test_re_Test).\n",
    "\n",
    "    Parameters:\n",
    "        data_syn: DataFrame of synthetic samples; the columns 'index_meta', 'topic_name',\n",
    "            'text' and 'area_TEIS' are read here.\n",
    "        individual_Segment_dict: dict describing the segment; updated in place. It must hold\n",
    "            'red_dots_list' (plus 'blue_dots_list' when dots_mode is \"Both\") and, when a model\n",
    "            is actually trained, 'PCA', 'coordinates' and 'sliding_window_level'.\n",
    "        segment_key: tuple identifying the segment; element 0 is used as the row label of the\n",
    "            validation report from which the recall objective is read.\n",
    "\n",
    "    Returns:\n",
    "        The same `individual_Segment_dict` with 'model', 'true_labels', 'predicted_labels',\n",
    "        'classification_df', 'fitness_score', 'number_of_syn_sample', 'retraining_time' and\n",
    "        'retrained_dots_list' set. 'fitness_score' is (validation accuracy, validation recall\n",
    "        of segment_key[0]), or (None, None) when the segment selects no samples.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the global `dots_mode` is neither \"False\" nor \"Both\".\n",
    "\n",
    "    Side effects (only when a model is trained):\n",
    "        Adds the fit time to the global `sum_GPU_seconds`, sets the global `GPU_limit` once\n",
    "        `total_gpu_seconds` is reached, appends a row to the global `test_fold_results_df`\n",
    "        and writes CSV/pickle files below `HSW_results_path`.\n",
    "    \"\"\"\n",
    "    # Only the globals that are rebound here need a declaration; the other module-level\n",
    "    # names (dots_mode, history_segments_dict, X_train_r, catboost_params, ...) are only read.\n",
    "    global sum_GPU_seconds\n",
    "    global GPU_limit\n",
    "    global test_fold_results_df\n",
    "\n",
    "    CPU_monitor_memory_usage()\n",
    "    monitor_gpu_memory()\n",
    "\n",
    "    if dots_mode == \"False\":\n",
    "        syn_original_list = individual_Segment_dict[\"red_dots_list\"]\n",
    "    elif dots_mode == \"Both\":\n",
    "        syn_original_list = individual_Segment_dict[\"red_dots_list\"] + individual_Segment_dict[\"blue_dots_list\"]\n",
    "    else:\n",
    "        # Any other value used to leave syn_original_list unbound and crash on the next line.\n",
    "        raise ValueError(f\"Unsupported dots_mode {dots_mode!r}; expected 'False' or 'Both'.\")\n",
    "\n",
    "    if len(syn_original_list) == 0:\n",
    "        # Nothing to retrain on: mark the segment as non-evaluable.\n",
    "        individual_Segment_dict['model'] = None\n",
    "        individual_Segment_dict['true_labels'] = None\n",
    "        individual_Segment_dict['predicted_labels'] = None\n",
    "        individual_Segment_dict['classification_df'] = None\n",
    "        individual_Segment_dict['fitness_score'] = (None, None)\n",
    "        individual_Segment_dict['number_of_syn_sample'] = None\n",
    "        individual_Segment_dict['retraining_time'] = None\n",
    "        individual_Segment_dict['retrained_dots_list'] = []\n",
    "        return individual_Segment_dict\n",
    "\n",
    "    # Reuse the result of an earlier segment that selected exactly the same samples.\n",
    "    for previous_Segment_dict in history_segments_dict.values():\n",
    "        if syn_original_list == previous_Segment_dict['retrained_dots_list']:\n",
    "            for field in ('model', 'true_labels', 'predicted_labels', 'classification_df',\n",
    "                          'fitness_score', 'number_of_syn_sample', 'retraining_time'):\n",
    "                individual_Segment_dict[field] = previous_Segment_dict[field]\n",
    "            individual_Segment_dict['retrained_dots_list'] = syn_original_list\n",
    "            return individual_Segment_dict\n",
    "\n",
    "    filtered_syn_df = data_syn[data_syn['index_meta'].isin(syn_original_list)]\n",
    "\n",
    "    # Augment the original training data with the selected synthetic samples.\n",
    "    X_train_re = pd.concat([X_train_r, filtered_syn_df.drop(columns=['topic_name'])])\n",
    "    Y_train_re = pd.concat([Y_train_r, filtered_syn_df['topic_name']])\n",
    "\n",
    "    train_pool_re = Pool(\n",
    "        X_train_re[[\"text\", \"area_TEIS\"]],\n",
    "        Y_train_re,\n",
    "        text_features=[\"text\"],\n",
    "        cat_features=[\"area_TEIS\"]\n",
    "    )\n",
    "    valid_pool_re = Pool(\n",
    "        X_test_re[[\"text\", \"area_TEIS\"]],\n",
    "        Y_test_re,\n",
    "        text_features=[\"text\"],\n",
    "        cat_features=[\"area_TEIS\"]\n",
    "    )\n",
    "\n",
    "    # Model training; only the fit() call is timed.\n",
    "    model_re = CatBoostClassifier(**catboost_params)\n",
    "    start_time = time.time()\n",
    "    model_re.fit(train_pool_re, eval_set=valid_pool_re)\n",
    "    training_time = time.time() - start_time\n",
    "\n",
    "    # Charge the fit time against the training budget.\n",
    "    sum_GPU_seconds += training_time\n",
    "    if sum_GPU_seconds >= total_gpu_seconds:\n",
    "        GPU_limit = True\n",
    "\n",
    "    # Validation performance: this is what the fitness score is built from.\n",
    "    predictions = model_re.predict(X_test_re[[\"text\", \"area_TEIS\"]])\n",
    "    accuracy = accuracy_score(Y_test_re, predictions)\n",
    "    report = classification_report(Y_test_re, predictions, digits=3, output_dict=True)\n",
    "    classification_df = classification_report_to_df(report, Y_test_re, predictions)\n",
    "\n",
    "    fitness_score = (accuracy, classification_df.loc[segment_key[0], 'recall'])\n",
    "\n",
    "    # Save the trained model, classification_df, and fitness_score\n",
    "    individual_Segment_dict['model'] = model_re\n",
    "    # Labels and predictions are not stored; empty lists are kept as placeholders.\n",
    "    individual_Segment_dict['true_labels'] = []\n",
    "    individual_Segment_dict['predicted_labels'] = []\n",
    "    individual_Segment_dict['classification_df'] = classification_df\n",
    "    individual_Segment_dict['fitness_score'] = fitness_score\n",
    "    individual_Segment_dict['number_of_syn_sample'] = len(filtered_syn_df)\n",
    "    individual_Segment_dict['retraining_time'] = training_time\n",
    "    individual_Segment_dict['retrained_dots_list'] = syn_original_list\n",
    "\n",
    "    # Test-set results below\n",
    "    test_predictions = model_re.predict(X_test_re_Test[[\"text\", \"area_TEIS\"]])\n",
    "    test_report = classification_report(Y_test_re_Test, test_predictions, digits=3, output_dict=True)\n",
    "    test_classification_df = classification_report_to_df(test_report, Y_test_re_Test, test_predictions)\n",
    "\n",
    "    # The current number of rows doubles as the running id of this evaluation.\n",
    "    test_new_row_index = len(test_fold_results_df)\n",
    "    test_class_DF_path = f'{HSW_results_path}/test_Class_DF'\n",
    "    os.makedirs(test_class_DF_path, exist_ok=True)\n",
    "    test_class_df_stem = f'{test_class_DF_path}/test_{topic_number}_HSW_{test_new_row_index}_AllEval_ClassDF'\n",
    "    test_classification_df.to_csv(f'{test_class_df_stem}.csv', index=True)\n",
    "    test_classification_df.to_pickle(f'{test_class_df_stem}.pkl')\n",
    "\n",
    "    # One result row per trained segment.\n",
    "    # NOTE(review): 'fitness_score', 'accuracy', 'topic_recall' and 'T13_TR_Imp' come from the\n",
    "    # validation report (classification_df) while the remaining metrics come from the test\n",
    "    # report (test_classification_df) - confirm this mix is intended.\n",
    "    test_new_ParamCV_row = {\n",
    "        \"topic_name\": topic_name,\n",
    "        \"topic_number\": topic_number,\n",
    "        \"PCA_index\": pca_pairs_used.index(individual_Segment_dict['PCA']),\n",
    "        \"PCA\": individual_Segment_dict['PCA'],\n",
    "        \"coordinates\": individual_Segment_dict['coordinates'],\n",
    "        'fitness_score': individual_Segment_dict['fitness_score'],\n",
    "        \"accuracy\": individual_Segment_dict['fitness_score'][0],\n",
    "        \"topic_recall\": individual_Segment_dict['fitness_score'][1],\n",
    "        'balanced_fitness_score': (test_classification_df.loc['accuracy', 'Balanced Accuracy'], test_classification_df.loc[topic_name, 'Balanced Accuracy']),\n",
    "        'overall_balanced_accuracy': test_classification_df.loc['accuracy', 'Balanced Accuracy'],\n",
    "        'topic_balanced_accuracy': test_classification_df.loc[topic_name, 'Balanced Accuracy'],\n",
    "        'balanced_acc_rec_score': (test_classification_df.loc[topic_name, 'Balanced Accuracy'], test_classification_df.loc[topic_name, 'recall']),\n",
    "        'topic_F1': test_classification_df.loc[topic_name, 'f1-score'],\n",
    "        'overall_F1': test_classification_df.loc['accuracy', 'f1-score'],\n",
    "        'overall_recall': test_classification_df.loc['accuracy', 'recall'],\n",
    "        \"retraining_time\": individual_Segment_dict[\"retraining_time\"],\n",
    "        \"number_of_syn_sample\": individual_Segment_dict['number_of_syn_sample'],\n",
    "        \"retrained_dots_list\": individual_Segment_dict['retrained_dots_list'],\n",
    "        'true_labels': individual_Segment_dict['true_labels'],\n",
    "        'predicted_labels': individual_Segment_dict['predicted_labels'],\n",
    "        \"segment_key\": (topic_name, topic_number, individual_Segment_dict['PCA'], individual_Segment_dict['coordinates'], individual_Segment_dict[\"sliding_window_level\"]),\n",
    "        'classDF_path': f'{test_class_df_stem}.csv',\n",
    "        'T13_TBA_Imp': test_classification_df.loc['Humanitarian aid for Ukraine.', 'Balanced Accuracy'] - bch_m0.loc['Humanitarian aid for Ukraine.', 'Balanced Accuracy'],\n",
    "        'T13_TR_Imp': classification_df.loc['Humanitarian aid for Ukraine.', 'recall'] - bch_m0.loc['Humanitarian aid for Ukraine.', 'recall']\n",
    "    }\n",
    "\n",
    "    # Append the row and persist the whole table after every evaluation, so an\n",
    "    # interrupted run keeps the results gathered so far.\n",
    "    test_new_ParamCV_row_df = pd.DataFrame([test_new_ParamCV_row])\n",
    "    test_fold_results_df = pd.concat([test_fold_results_df, test_new_ParamCV_row_df], ignore_index=True)\n",
    "    test_fold_results_df.to_csv(f'{HSW_results_path}/test_HSWAllSegs_{HSW_results_name}.csv', index=False)\n",
    "    test_fold_results_df.to_pickle(f'{HSW_results_path}/test_HSWAllSegs_{HSW_results_name}.pkl')\n",
    "\n",
    "    return individual_Segment_dict\n",
    "\n",
    "\n",
    "def sliding_window_get_dots(history_segments_dict, prev_segment_key, level, tr_df, data_syn, topic_name, topic_number, area_size_x, area_size_y, level_x_start, level_y_start, pca_pairs = [(\"PCA_0\", \"PCA_1\")], num_segments=16, window_size=1):\n",
    "    \"\"\"\n",
    "    Slide a window over a PCA plane, collect the points inside each window and retrain on them.\n",
    "\n",
    "    For every (pca1, pca2) pair the area is cut into `num_segments` steps per axis and a window\n",
    "    of `window_size` steps is moved across it. Each window that is not yet present in\n",
    "    `history_segments_dict` (same topic, PCA pair, coordinates and level) is handed to\n",
    "    segment_retraining(); windows whose fitness score is not (None, None) are stored.\n",
    "\n",
    "    Parameters:\n",
    "        history_segments_dict: dict keyed by segment-key tuples; updated in place.\n",
    "        prev_segment_key: key of the parent segment, stored as last element of each new key.\n",
    "        level: sliding-window level recorded in the key and in the segment dict.\n",
    "        tr_df: DataFrame providing 'topic_name', 'pred_topic_name', 'index_meta' and the PCA columns.\n",
    "        data_syn: passed through to segment_retraining().\n",
    "        topic_name, topic_number: topic identifiers used for the masks and the keys.\n",
    "        area_size_x, area_size_y: \"FullSize\" to span the column's min..max, otherwise a\n",
    "            (start, end) pair.\n",
    "        level_x_start, level_y_start: \"minimum\" to start at the column minimum, otherwise the\n",
    "            numeric start coordinate.\n",
    "        pca_pairs: iterable of (x column, y column) names.\n",
    "        num_segments: number of grid steps per axis.\n",
    "        window_size: window edge length in grid steps.\n",
    "\n",
    "    Returns:\n",
    "        `history_segments_dict`; returned early as soon as the global GPU_limit is True.\n",
    "    \"\"\"\n",
    "    global GPU_limit\n",
    "\n",
    "    for pca1, pca2 in pca_pairs:\n",
    "        if GPU_limit == True:\n",
    "            return history_segments_dict\n",
    "\n",
    "        # Point categories: other topics, correctly predicted topic rows, missed topic rows\n",
    "        other_topic_rows = tr_df['topic_name'] != topic_name\n",
    "        hit_rows = (tr_df['topic_name'] == topic_name) & (tr_df['pred_topic_name'] == topic_name)\n",
    "        miss_rows = (tr_df['topic_name'] == topic_name) & (tr_df['pred_topic_name'] != topic_name)\n",
    "\n",
    "        # Grid step along each axis\n",
    "        x_extent = (tr_df[pca1].max() - tr_df[pca1].min()) if area_size_x == \"FullSize\" else (area_size_x[1] - area_size_x[0])\n",
    "        step_x = x_extent / num_segments\n",
    "        y_extent = (tr_df[pca2].max() - tr_df[pca2].min()) if area_size_y == \"FullSize\" else (area_size_y[1] - area_size_y[0])\n",
    "        step_y = y_extent / num_segments\n",
    "\n",
    "        # Visit every window position; i walks the x axis (outer), j the y axis (inner)\n",
    "        positions_per_axis = num_segments - window_size + 1\n",
    "        for i, j in itertools.product(range(positions_per_axis), repeat=2):\n",
    "            if GPU_limit == True:\n",
    "                return history_segments_dict\n",
    "\n",
    "            x_origin = tr_df[pca1].min() if level_x_start == \"minimum\" else level_x_start\n",
    "            y_origin = tr_df[pca2].min() if level_y_start == \"minimum\" else level_y_start\n",
    "            x_start = x_origin + i * step_x\n",
    "            y_start = y_origin + j * step_y\n",
    "            x_end = x_start + window_size * step_x\n",
    "            y_end = y_start + window_size * step_y\n",
    "\n",
    "            coordinates = f\"{x_start}-{x_end}, {y_start}-{y_end}\"\n",
    "            # Identity of the window without its parent; the full key appends the parent key\n",
    "            window_identity = (topic_name, topic_number, (pca1, pca2), coordinates, level)\n",
    "            segment_key = window_identity + (prev_segment_key,)\n",
    "\n",
    "            # Skip windows already recorded, whatever parent they were reached from\n",
    "            if any(known_key[:-1] == window_identity for known_key in history_segments_dict):\n",
    "                continue\n",
    "\n",
    "            within_x = (tr_df[pca1] >= x_start) & (tr_df[pca1] <= x_end)\n",
    "            within_y = (tr_df[pca2] >= y_start) & (tr_df[pca2] <= y_end)\n",
    "            inside_window = within_x & within_y\n",
    "\n",
    "            individual_Segment_dict = {\n",
    "                \"coordinates\": coordinates,\n",
    "                \"PCA\": (pca1, pca2),\n",
    "                \"blue_dots_list\": tr_df.loc[inside_window & hit_rows, 'index_meta'].tolist(),\n",
    "                \"red_dots_list\": tr_df.loc[inside_window & miss_rows, 'index_meta'].tolist(),\n",
    "                \"grey_dots_list\": tr_df.loc[inside_window & other_topic_rows, 'index_meta'].tolist(),\n",
    "                \"sliding_window_level\": level,\n",
    "            }\n",
    "            individual_Segment_dict = segment_retraining(data_syn, individual_Segment_dict, segment_key)\n",
    "\n",
    "            # Non-evaluable windows are not recorded\n",
    "            if individual_Segment_dict['fitness_score'] == (None, None):\n",
    "                continue\n",
    "            history_segments_dict[segment_key] = individual_Segment_dict\n",
    "            print(individual_Segment_dict['fitness_score'])\n",
    "\n",
    "        print(f'Finished hierarchical sliding window for {topic_name}_{pca1}_{pca2}.')\n",
    "    return history_segments_dict\n",
    "\n",
    "\n",
    "def hierarchical_sliding_window_retrain(history_segments_dict, train_PCA_YZ_df, data_syn, topic_name, topic_number, level_0_area_size_x, level_0_area_size_y, level_0_x_start,  level_0_y_start, level_0_pca_pairs, levels_num_segments_list, levels_window_size_list):\n",
    "    \"\"\"\n",
    "    Run the hierarchical sliding-window search: level 0 first, then recurse into evaluated segments.\n",
    "\n",
    "    Level 0 scans the configured area for every pair in `level_0_pca_pairs`. Each deeper level\n",
    "    re-scans the area of every segment returned by get_current_level_segments() whose fitness\n",
    "    score is not (None, None), using that level's entries of `levels_num_segments_list` and\n",
    "    `levels_window_size_list`. The search stops as soon as the global GPU_limit is True.\n",
    "\n",
    "    Side effect: resets the global `sw_level` to 0.\n",
    "\n",
    "    Returns:\n",
    "        The `history_segments_dict` returned by the level-0 scan.\n",
    "    \"\"\"\n",
    "    global sw_level\n",
    "    global GPU_limit\n",
    "    sw_level = 0\n",
    "\n",
    "    def _refine(segments_so_far, depth):\n",
    "        \"\"\"Scan inside every evaluable segment for this depth, then recurse one level deeper.\"\"\"\n",
    "        global GPU_limit\n",
    "        if GPU_limit == True:\n",
    "            print('Terminating due to GPU training time')\n",
    "            return\n",
    "\n",
    "        grouped_segments = get_current_level_segments(segments_so_far, depth)\n",
    "        if not grouped_segments:\n",
    "            print('out2')\n",
    "            return\n",
    "\n",
    "        # No grid configuration left for this depth\n",
    "        if depth >= len(levels_num_segments_list):\n",
    "            print('out1')\n",
    "            return\n",
    "\n",
    "        for sibling_segments in grouped_segments.values():\n",
    "            refined_dicts = []\n",
    "            for parent_key, parent_segment in sibling_segments.items():\n",
    "                if parent_segment['fitness_score'] == (None, None):\n",
    "                    continue\n",
    "\n",
    "                # The parent's coordinate string defines the area of the finer scan\n",
    "                x_start, x_end, y_start, y_end = extract_coordinates(parent_key[3])\n",
    "                pca1, pca2 = parent_key[2]\n",
    "\n",
    "                refined_dicts.append(\n",
    "                    sliding_window_get_dots(\n",
    "                        segments_so_far, parent_key, depth, train_PCA_YZ_df, data_syn, topic_name, topic_number,\n",
    "                        (x_start, x_end), (y_start, y_end), x_start, y_start,\n",
    "                        pca_pairs=[(pca1, pca2)],\n",
    "                        num_segments=levels_num_segments_list[depth],\n",
    "                        window_size=levels_window_size_list[depth]\n",
    "                    )\n",
    "                )\n",
    "\n",
    "            if GPU_limit == True:\n",
    "                print('Terminating due to GPU training time')\n",
    "                return\n",
    "\n",
    "            for refined_dict in refined_dicts:\n",
    "                _refine(refined_dict, depth + 1)\n",
    "                if GPU_limit == True:\n",
    "                    print('Terminating due to GPU training time')\n",
    "                    return\n",
    "\n",
    "    # Level 0 has no parent segment: its placeholder key only carries the first PCA pair\n",
    "    first_pair = level_0_pca_pairs[0]\n",
    "    root_key = (topic_name, topic_number, (first_pair[0], first_pair[1]), None, None, None)\n",
    "    history_segments_dict = sliding_window_get_dots(\n",
    "        history_segments_dict, root_key, sw_level, train_PCA_YZ_df, data_syn, topic_name, topic_number,\n",
    "        level_0_area_size_x, level_0_area_size_y, level_0_x_start, level_0_y_start,\n",
    "        pca_pairs=level_0_pca_pairs,\n",
    "        num_segments=levels_num_segments_list[sw_level],\n",
    "        window_size=levels_window_size_list[sw_level]\n",
    "    )\n",
    "\n",
    "    if GPU_limit == True:\n",
    "        print('Terminating due to GPU training time')\n",
    "        return history_segments_dict\n",
    "\n",
    "    _refine(history_segments_dict, sw_level + 1)\n",
    "    return history_segments_dict\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-26T01:18:24.175211Z",
     "iopub.status.busy": "2024-09-26T01:18:24.174216Z",
     "iopub.status.idle": "2024-09-26T01:18:24.202239Z",
     "shell.execute_reply": "2024-09-26T01:18:24.201238Z"
    }
   },
   "outputs": [],
   "source": [
    "def dominates(score1, score2):\n",
    "    \"\"\"\n",
    "    Determines if one score dominates another.\n",
    "    A score1 dominates score2 if it is better in all the objectives or equal in some and better in at least one.\n",
    "    \"\"\"\n",
    "    return (score1[0] > score2[0] and score1[1] >= score2[1]) or (score1[0] >= score2[0] and score1[1] > score2[1])\n",
    "\n",
    "def find_pareto_front(df):\n",
    "    \"\"\"\n",
    "    Return a copy of `df` with a 'Pareto' column: 'Yes' for rows whose\n",
    "    'balanced_acc_rec_score' is not dominated by any other row, 'No' otherwise.\n",
    "    \"\"\"\n",
    "    flagged = df.copy()  # the caller's frame is left untouched\n",
    "    flagged['Pareto'] = 'No'\n",
    "\n",
    "    all_scores = flagged['balanced_acc_rec_score'].tolist()\n",
    "\n",
    "    # A row is on the front when no other row dominates it\n",
    "    on_front = np.array(\n",
    "        [\n",
    "            not any(\n",
    "                dominates(rival, candidate)\n",
    "                for rival_pos, rival in enumerate(all_scores)\n",
    "                if rival_pos != candidate_pos\n",
    "            )\n",
    "            for candidate_pos, candidate in enumerate(all_scores)\n",
    "        ],\n",
    "        dtype=bool,\n",
    "    )\n",
    "\n",
    "    flagged.loc[on_front, 'Pareto'] = 'Yes'\n",
    "    return flagged\n",
    "\n",
    "def find_best_values(df):\n",
    "    # Identify the maximum values for each specified column\n",
    "    max_values = {\n",
    "        'accuracy': df['accuracy'].max(),\n",
    "        'topic_recall': df['topic_recall'].max(),\n",
    "        'overall_balanced_accuracy': df['overall_balanced_accuracy'].max(),\n",
    "        'topic_balanced_accuracy': df['topic_balanced_accuracy'].max(),\n",
    "        'topic_F1': df['topic_F1'].max(),\n",
    "        'overall_F1': df['overall_F1'].max(),\n",
    "        'overall_recall': df['overall_recall'].max()\n",
    "    }\n",
    "    \n",
    "    # Function to apply to each row to determine the best columns\n",
    "    def check_best(row):\n",
    "        return [col for col, max_val in max_values.items() if row[col] == max_val]\n",
    "\n",
    "    # Apply the function to each row\n",
    "    df['best'] = df.apply(check_best, axis=1)\n",
    "    \n",
    "    return df\n",
    "\n",
    "def post_process(df, bch_class_df):\n",
    "    \"\"\"\n",
    "    Add improvement-over-benchmark columns and their running statistics to a results frame.\n",
    "\n",
    "    The benchmark values are read from `bch_class_df` for the topic named by the globals\n",
    "    topic_dict[topic_number] and for the 'accuracy' row. `df` is modified in place and returned.\n",
    "    \"\"\"\n",
    "    global topic_number\n",
    "    benchmark_topic = topic_dict[topic_number]\n",
    "\n",
    "    # (new column, metric column, benchmark value); every benchmark lookup happens before df is touched\n",
    "    improvement_specs = (\n",
    "        ('imp_topic_recall', 'topic_recall', bch_class_df.loc[benchmark_topic, 'recall']),\n",
    "        ('imp_topic_balanced_accuracy', 'topic_balanced_accuracy', bch_class_df.loc[benchmark_topic, 'Balanced Accuracy']),\n",
    "        ('imp_overall_balanced_accuracy', 'overall_balanced_accuracy', bch_class_df.loc['accuracy', 'Balanced Accuracy']),\n",
    "        ('imp_overall_F1', 'overall_F1', bch_class_df.loc['accuracy', 'f1-score']),\n",
    "    )\n",
    "    for improvement_col, metric_col, benchmark_value in improvement_specs:\n",
    "        df[improvement_col] = df[metric_col] - benchmark_value\n",
    "\n",
    "    # Retraining time accumulated in row order\n",
    "    df['cumulative_time'] = df['retraining_time'].cumsum()\n",
    "\n",
    "    # (improvement column, running-max column, running-mean column)\n",
    "    running_specs = (\n",
    "        ('imp_topic_recall', 'max_topic_recall_imp', 'average_topic_recall_imp'),\n",
    "        ('imp_topic_balanced_accuracy', 'max_topic_balanced_acc_imp', 'average_topic_balanced_acc_imp'),\n",
    "        ('imp_overall_balanced_accuracy', 'max_overall_balanced_acc_imp', 'average_overall_balanced_acc_imp'),\n",
    "        ('imp_overall_F1', 'max_overall_F1_improvement', 'average_overall_F1_improvement'),\n",
    "    )\n",
    "    for improvement_col, running_max_col, running_mean_col in running_specs:\n",
    "        single_column = df[[improvement_col]]\n",
    "        df[running_max_col] = single_column.max(axis=1).cummax()\n",
    "        df[running_mean_col] = single_column.mean(axis=1).expanding().mean()\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-26T01:18:24.206234Z",
     "iopub.status.busy": "2024-09-26T01:18:24.205239Z",
     "iopub.status.idle": "2024-09-26T01:18:24.233465Z",
     "shell.execute_reply": "2024-09-26T01:18:24.232356Z"
    }
   },
   "outputs": [],
   "source": [
    "def bch_classification_report_to_df(report, y_true, y_pred):\n",
    "    \"\"\"\n",
    "    Turn a classification report dict into a DataFrame extended with\n",
    "    'Sensitivity', 'Specificity' and 'Balanced Accuracy' columns.\n",
    "\n",
    "    Parameters:\n",
    "        report: report dict, as produced by classification_report(..., output_dict=True).\n",
    "            Its last three entries are treated as the summary rows 'accuracy',\n",
    "            'macro avg' and 'weighted avg'; all entries before them as class labels.\n",
    "        y_true: true labels, forwarded to confusion_matrix().\n",
    "        y_pred: predicted labels, forwarded to confusion_matrix().\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with one row per report entry. For class rows Sensitivity is the recall\n",
    "        and Specificity is TN / (TN + FP) from the confusion matrix; the 'accuracy' row holds\n",
    "        the values pooled over all classes; 'macro avg' / 'weighted avg' hold the plain and\n",
    "        support-weighted means of the class rows. Balanced Accuracy is the mean of\n",
    "        Sensitivity and Specificity in every row.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if the report has no 'accuracy' entry.\n",
    "    \"\"\"\n",
    "    df = pd.DataFrame(report).transpose()\n",
    "\n",
    "    # Without an 'accuracy' row the \"last three rows are summaries\" assumption does not hold.\n",
    "    if 'accuracy' not in df.index:\n",
    "        raise KeyError(\"classification report has no 'accuracy' entry\")\n",
    "\n",
    "    # Everything except 'accuracy', 'macro avg', 'weighted avg' is a class label\n",
    "    labels = df.index[:-3]\n",
    "    cm = confusion_matrix(y_true, y_pred, labels=labels)\n",
    "\n",
    "    # Per-class TP, FP, FN, TN (rows of cm are true labels, columns are predictions)\n",
    "    TP = cm.diagonal()\n",
    "    FP = cm.sum(axis=0) - TP\n",
    "    FN = cm.sum(axis=1) - TP\n",
    "    TN = cm.sum() - (FP + FN + TP)\n",
    "\n",
    "    # Sensitivity / specificity pooled over all classes (used for the 'accuracy' row)\n",
    "    sens = TP.sum() / (TP.sum() + FN.sum())\n",
    "    spec = TN.sum() / (TN.sum() + FP.sum())\n",
    "\n",
    "    # Sensitivity is the same quantity as recall\n",
    "    df['Sensitivity'] = df['recall']\n",
    "\n",
    "    # Per-class specificity; the three summary rows are filled in below\n",
    "    df.loc[labels, 'Specificity'] = TN / (TN + FP)\n",
    "\n",
    "    # The 'accuracy' row receives the pooled values, not the accuracy itself\n",
    "    df.loc['accuracy', ['Sensitivity', 'Specificity']] = sens, spec\n",
    "\n",
    "    # 'macro avg' and 'weighted avg' over the class rows only\n",
    "    class_rows = df.iloc[:-3]\n",
    "    df.loc['macro avg', 'Sensitivity'] = class_rows['Sensitivity'].mean()\n",
    "    df.loc['weighted avg', 'Sensitivity'] = np.average(class_rows['Sensitivity'], weights=class_rows['support'])\n",
    "\n",
    "    # Re-slice: 'Specificity' was written after the first slice was taken\n",
    "    class_rows = df.iloc[:-3]\n",
    "    df.loc['macro avg', 'Specificity'] = class_rows['Specificity'].mean()\n",
    "    df.loc['weighted avg', 'Specificity'] = np.average(class_rows['Specificity'], weights=class_rows['support'])\n",
    "\n",
    "    # Balanced Accuracy for each row, including the summary rows\n",
    "    df['Balanced Accuracy'] = (df['Sensitivity'] + df['Specificity']) / 2\n",
    "\n",
    "    return df\n",
    "\n",
    "def train_bch(X_train_re, X_test_re, Y_train_re, Y_test_re, catboost_params, itr0_path):\n",
    "    \"\"\"\n",
    "    Train the benchmark model and save its validation and test classification reports.\n",
    "\n",
    "    The model is fitted on (X_train_re, Y_train_re) with (X_test_re, Y_test_re) as eval set.\n",
    "    It is then scored on that validation data and on the global test data\n",
    "    (X_test_re_Test, Y_test_re_Test); both report frames are written to `itr0_path`\n",
    "    as pickle and CSV, and both accuracies are printed.\n",
    "\n",
    "    Returns:\n",
    "        dict with 'model', 'classification_df' (test report), 'accuracy' (test accuracy)\n",
    "        and 'retraining_time' (seconds spent in fit()).\n",
    "    \"\"\"\n",
    "    global X_test_re_Test\n",
    "    global Y_test_re_Test\n",
    "    CPU_monitor_memory_usage()\n",
    "    monitor_gpu_memory()\n",
    "\n",
    "    def _build_pool(features, targets):\n",
    "        # Same feature selection and feature typing for the training and the validation pool\n",
    "        return Pool(\n",
    "            features[[\"text\", \"area_TEIS\"]],\n",
    "            targets,\n",
    "            text_features=[\"text\"],\n",
    "            cat_features=[\"area_TEIS\"]\n",
    "        )\n",
    "\n",
    "    # Both pools are built before the clock starts, so only fit() is timed\n",
    "    fit_pool = _build_pool(X_train_re, Y_train_re)\n",
    "    eval_pool = _build_pool(X_test_re, Y_test_re)\n",
    "\n",
    "    benchmark_model = CatBoostClassifier(**catboost_params)\n",
    "    fit_started = time.time()\n",
    "    benchmark_model.fit(fit_pool, eval_set=eval_pool)\n",
    "    fit_seconds = time.time() - fit_started\n",
    "\n",
    "    # Validation performance\n",
    "    validation_predictions = benchmark_model.predict(X_test_re[[\"text\", \"area_TEIS\"]])\n",
    "    validation_accuracy = accuracy_score(Y_test_re, validation_predictions)\n",
    "    validation_report = classification_report(Y_test_re, validation_predictions, digits=3, output_dict=True)\n",
    "    print(validation_accuracy)\n",
    "    validation_df = bch_classification_report_to_df(validation_report, Y_test_re, validation_predictions)\n",
    "    validation_df.to_pickle(f\"{itr0_path}/Validation_Benchmark_M0_Classdf_0.pkl\")\n",
    "    validation_df.to_csv(f\"{itr0_path}/Validation_Benchmark_M0_Classdf_0.csv\", index=True)\n",
    "\n",
    "    # Test performance\n",
    "    test_predictions = benchmark_model.predict(X_test_re_Test[[\"text\", \"area_TEIS\"]])\n",
    "    test_accuracy = accuracy_score(Y_test_re_Test, test_predictions)\n",
    "    test_report = classification_report(Y_test_re_Test, test_predictions, digits=3, output_dict=True)\n",
    "    print(test_accuracy)\n",
    "    test_df = bch_classification_report_to_df(test_report, Y_test_re_Test, test_predictions)\n",
    "    test_df.to_pickle(f\"{itr0_path}/Benchmark_M0_Classdf_0.pkl\")\n",
    "    test_df.to_csv(f\"{itr0_path}/Benchmark_M0_Classdf_0.csv\", index=True)\n",
    "\n",
    "    return {\n",
    "        'model': benchmark_model,\n",
    "        'classification_df': test_df,\n",
    "        'accuracy': test_accuracy,\n",
    "        'retraining_time': fit_seconds,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-09-26T01:18:24.237302Z",
     "iopub.status.busy": "2024-09-26T01:18:24.237302Z",
     "iopub.status.idle": "2024-09-26T02:24:12.865408Z",
     "shell.execute_reply": "2024-09-26T02:24:12.863409Z"
    }
   },
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    run = 1\n",
    "    rand = 10\n",
    "    \n",
    "    run_path = f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}\"\n",
    "    itr0_path = f\"{run_path}/Iteration_0\"\n",
    "    os.makedirs(itr0_path, exist_ok=True)\n",
    "\n",
    "    # Load Data\n",
    "    data = pd.read_csv(f'D:/AutoGeTS/Data/tickets_topics.csv',lineterminator='\\n')\n",
    "    data_topic = data.dropna().reset_index()\n",
    "    data_topic = data_topic.rename(columns={'index': 'index_meta'})\n",
    "\n",
    "    X_train_r_both, X_test_re_Test, Y_train_r_both, Y_test_re_Test = train_test_split(data_topic, data_topic.topic_name, test_size = 0.2,random_state = 42)\n",
    "        \n",
    "    # Further split the training set to create a validation set\n",
    "    X_train_r, X_test_re, Y_train_r, Y_test_re = train_test_split(\n",
    "        X_train_r_both, \n",
    "        Y_train_r_both, \n",
    "        test_size=0.2,  # 20% of the initial training set, which is 16% of the original data\n",
    "        random_state=rand\n",
    "    )\n",
    "\n",
    "    catboost_params = {'iterations': 300, 'learning_rate': 0.2, 'depth': 8, 'l2_leaf_reg': 1, \n",
    "                        'bagging_temperature': 1, 'random_strength': 1, 'border_count': 254, \n",
    "                        'eval_metric': 'TotalF1', 'task_type': 'GPU', 'early_stopping_rounds': 20, 'use_best_model': True, 'verbose': 0, 'random_seed': rand}\n",
    "\n",
    "    topic_dict = {\"T1\": \"IT support and assistance.\",\"T2\": \"Account activation and access issues.\",\"T3\": \"Password and device security.\",\n",
    "                    \"T4\": \"Printer issues and troubleshooting.\",\"T5\": \"HP Dock connectivity issues.\",\"T6\": \"Employee documentation and errors.\",\n",
    "                    \"T7\": \"\\\"Access and login issues\\\"\",\"T8\": \"Opening and managing files/devices.\",\"T9\": \"Mobile email and VPN setup.\",\n",
    "                    \"T10\": \"IT support and communication.\",\"T11\": \"Error handling in RPG programming.\",\"T12\": \"Email security and attachments.\",\n",
    "                    \"T13\": \"Humanitarian aid for Ukraine.\",\"T14\": \"Internet connectivity issues in offices.\",\"T15\": \"Improving integration with Infojobs.\"}\n",
    "\n",
    "    bch_dict = train_bch(X_train_r, X_test_re, Y_train_r, Y_test_re, catboost_params, itr0_path)\n",
    "    for iteration in [3]:\n",
    "        if iteration == 3:\n",
    "            topic_number = \"T4\"\n",
    "        # for topic_number in [\"T13\"]: # \"T1\", \"T2\", \"T4\", \"T5\", \"T10\", \"T13\", \"T14\" \n",
    "        # iteration = 2\n",
    "        prev_itr = iteration - 1\n",
    "\n",
    "        gpu_hours = 1\n",
    "\n",
    "        if topic_number in [\"T1\", \"T5\", \"T10\"]:\n",
    "            metric_name = \"overall_f1-score\" # \"recall\", \"Balanced Accuracy\", \"overall_balanced_accuracy\", \"overall_f1-score\"\n",
    "        elif topic_number in [\"T2\"]:\n",
    "            metric_name = \"recall\"\n",
    "        elif topic_number in [\"T13\"]:\n",
    "            metric_name = \"Balanced Accuracy\"\n",
    "        elif topic_number in [\"T4\", \"T14\"]:\n",
    "            metric_name = \"overall_balanced_accuracy\"\n",
    "\n",
    "        HSW_results_path = f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{iteration}/{topic_number}_{metric_name}_GPU{gpu_hours}h_HSW\"\n",
    "        os.makedirs(HSW_results_path, exist_ok=True)\n",
    "        \n",
    "        \"\"\"Results region\"\"\"\n",
    "        fold_results_df = pd.DataFrame() # pd.read_pickle(\"D:/AutoGeTS/Topic_Experiments/Hierarchical-SW_Results/HSWAllSegs_HSW_T13_ModeBoth_as=T,o,p,i,c,M,i,n,M,a,x_ns=8,4,2_ws=h,a,l,f_PCA_0PCA_1_PCA_18PCA_19.pkl\") # pd.DataFrame()  # DataFrame to collect aggregated results\n",
    "        fold_pfs_df = pd.DataFrame()  # DataFrame to collect details from each fold\n",
    "        pca_pfs_df = pd.DataFrame() # pd.read_pickle(\"D:/AutoGeTS/Topic_Experiments/Hierarchical-SW_Results/HSWpcaPFs_HSW_T13_ModeBoth_as=T,o,p,i,c,M,i,n,M,a,x_ns=8,4,2_ws=h,a,l,f_PCA_0PCA_1_PCA_18PCA_19.pkl\") # pd.DataFrame()\n",
    "        test_fold_results_df = pd.DataFrame()\n",
    "\n",
    "        history_segments_dict_allPCA = {}\n",
    "        history_pareto_segments_list = []\n",
    "        level_stats_df_allPCA = pd.DataFrame(columns=[\"PCA\", \"Level\", \"Previous Key\", \"Number of Evaluations\", \"Best Fitness\", \"Worst Fitness\", \"Mean Overall Accuracy\", \"Mean Topic Recall\", \"Pareto Front Segments\"])\n",
    "\n",
    "        if iteration > 1:\n",
    "            bch_class_df = pd.read_pickle(f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{prev_itr}/Bch_Itr_{prev_itr}.pkl\")\n",
    "            bch_filtered_columns = [col for col in bch_class_df.columns if not col.startswith(\"Diff\") and col != \"Accuracy\"]\n",
    "            bch_class_df = bch_class_df[bch_filtered_columns]\n",
    "        else:\n",
    "            bch_class_df = pd.read_pickle(f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_0/Benchmark_M0_Classdf_0.pkl\")\n",
    "\n",
    "        bch_m0 = pd.read_pickle(f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_0/Benchmark_M0_Classdf_0.pkl\")\n",
    "        \n",
    "        if iteration > 1:\n",
    "            prev_itr_X_train_re = pd.read_pickle(f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{prev_itr}/X_train_re_itr_{prev_itr}.pkl\")\n",
    "            X_train_r = prev_itr_X_train_re\n",
    "            prev_itr_Y_train_re = pd.read_pickle(f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{prev_itr}/Y_train_re_itr_{prev_itr}.pkl\")\n",
    "            Y_train_r = prev_itr_Y_train_re\n",
    "\n",
    "        # Get the set of 'index_meta' values from X_train_r\n",
    "        index_meta_values = X_train_r['index_meta'].unique()\n",
    "        train_PCA_YZ_df = pd.read_pickle(\"D:/AutoGeTS/Data/Train_PCA_YZ_withPred_0.pkl\")\n",
    "        train_PCA_YZ_df = train_PCA_YZ_df[train_PCA_YZ_df['index_meta'].isin(index_meta_values)]\n",
    "        # train_PCA_YZ_df = train_PCA_YZ_df.rename(columns={'index': 'index_meta'})\n",
    "        pca_columns = [col for col in train_PCA_YZ_df.columns if 'PCA_' in col]\n",
    "        pca_pairs = list(itertools.combinations(pca_columns, 2))\n",
    "\n",
    "        \"\"\"Parameters and Input Section ----------------\"\"\"\n",
    "        # topic_number = \"T13\"\n",
    "        syn_number = 1\n",
    "\n",
    "        dots_mode = \"Both\"\n",
    "\n",
    "        total_gpu_seconds = gpu_hours * 60 * 60\n",
    "        \n",
    "        # Added synthetic data path\n",
    "        if topic_number in [\"T1\", \"T2\"]:\n",
    "            data_syn_raw = pd.read_pickle(f'D:/AutoGeTS/Synthetic_Data/{topic_number}-synthesis-{syn_number}.pkl')\n",
    "        else:\n",
    "            data_syn_raw = pd.read_csv(f'D:/AutoGeTS/Synthetic_Data/{topic_number}-synthesis-{syn_number}.csv',lineterminator='\\n')\n",
    "        data_syn = data_syn_raw[[\"index_meta\", \"text\", \"area_TEIS\", 'topic_name', \"sample\"]].dropna()\n",
    "\n",
    "        hsw_param_dict = {'area_size': \"TopicMinMax\",\n",
    "                    'num_segments': [8, 4, 2],\n",
    "                    'window_size': \"half\"\n",
    "                    }\n",
    "\n",
    "        # level_0_pca_pairs = pca_pairs # pca_pairs, [(\"PCA_1\", \"PCA_13\")]\n",
    "        pca_pairs_used = pca_pairs\n",
    "\n",
    "        history_dict_name = f\"HSW-Retrain-Dict_{topic_number}_Mode{dots_mode}_{dict_to_foldername(hsw_param_dict)}_{''.join(item for item in pca_pairs_used[0])}_{''.join(item for item in pca_pairs_used[-1])}\"\n",
    "        level_stats_df_name = f\"HSW-Level-Stats_{topic_number}_Mode{dots_mode}_{dict_to_foldername(hsw_param_dict)}_{''.join(item for item in pca_pairs_used[0])}_{''.join(item for item in pca_pairs_used[-1])}\"\n",
    "        HSW_results_name = f\"HSW_{topic_number}_Mode{dots_mode}_{dict_to_foldername(hsw_param_dict)}_{''.join(item for item in pca_pairs_used[0])}_{''.join(item for item in pca_pairs_used[-1])}\"\n",
    "\n",
    "        \"\"\"------------------------------------------------\"\"\"\n",
    "        topic_name = topic_dict[topic_number]\n",
    "        clean_topic_name = clean_folder_name(topic_name)\n",
    "\n",
    "        sum_GPU_seconds = 0\n",
    "        GPU_limit = False\n",
    "\n",
    "        for pca_i, pca_pair in enumerate(pca_pairs):\n",
    "            # if pca_i <= 153:\n",
    "            #     continue\n",
    "            print(f\"Pair {pca_i}\", pca_pair)\n",
    "            CPU_monitor_memory_usage()\n",
    "            monitor_gpu_memory()\n",
    "\n",
    "            level_0_pca_pairs = [pca_pair]\n",
    "\n",
    "            history_segments_dict = {}\n",
    "            level_stats_df = pd.DataFrame(columns=[\"PCA\", \"Level\", \"Previous Key\", \"Number of Evaluations\", \"Best Fitness\", \"Worst Fitness\", \"Mean Overall Accuracy\", \"Mean Topic Recall\", \"Pareto Front Segments\"])\n",
    "\n",
    "            if hsw_param_dict['area_size'] == \"FullSize\":\n",
    "                level_0_area_size_x = \"FullSize\"\n",
    "                level_0_area_size_y = \"FullSize\"\n",
    "                level_0_x_start = \"minimum\"\n",
    "                level_0_y_start = \"minimum\"\n",
    "            elif hsw_param_dict['area_size'] == \"TopicMinMax\":\n",
    "                filtered_df = train_PCA_YZ_df[train_PCA_YZ_df['topic_name'] == topic_name]\n",
    "                level_0_area_size_x =(filtered_df[pca_pair[0]].min()*1.01, filtered_df[pca_pair[0]].max()*1.01)\n",
    "                level_0_area_size_y =(filtered_df[pca_pair[1]].min()*1.01, filtered_df[pca_pair[1]].max()*1.01)\n",
    "                level_0_x_start = level_0_area_size_x[0] \n",
    "                level_0_y_start = level_0_area_size_y[0]\n",
    "            else:\n",
    "                level_0_area_size_x = hsw_param_dict['area_size']\n",
    "                level_0_area_size_y = hsw_param_dict['area_size']\n",
    "                level_0_x_start = level_0_area_size_x[0]\n",
    "                level_0_y_start = level_0_area_size_y[0]\n",
    "            levels_num_segments_list = hsw_param_dict['num_segments']\n",
    "            if hsw_param_dict['window_size'] == \"half\":\n",
    "                levels_window_size_list = [int(item / 2) for item in levels_num_segments_list]\n",
    "            else:\n",
    "                levels_window_size_list = [int(hsw_param_dict['window_size']) for item in levels_num_segments_list]\n",
    "\n",
    "            history_segments_dict = hierarchical_sliding_window_retrain(history_segments_dict, train_PCA_YZ_df, data_syn, topic_name, topic_number, level_0_area_size_x, level_0_area_size_y, level_0_x_start,  level_0_y_start, level_0_pca_pairs, levels_num_segments_list, levels_window_size_list)\n",
    "            history_segments_dict_allPCA.update(history_segments_dict)\n",
    "            # with open(f'{HSW_results_path}/{history_dict_name}_AllPCADictsList.pkl', 'wb') as file:\n",
    "            #     pickle.dump(history_segments_dict_allPCA, file)\n",
    "            \n",
    "            level_stats_df_allPCA = pd.concat([level_stats_df_allPCA, level_stats_df], axis=0, ignore_index=True)\n",
    "            level_stats_df_allPCA.to_csv(f'{HSW_results_path}/{level_stats_df_name}.csv', index=False)\n",
    "            level_stats_df_allPCA.to_pickle(f'{HSW_results_path}/{level_stats_df_name}.pkl')\n",
    "\n",
    "            pca_pareto_observer(history_pareto_segments_list, pca_pair)\n",
    "\n",
    "            if GPU_limit == True:\n",
    "                fold_results_df  = find_pareto_front(fold_results_df)\n",
    "                fold_results_df = find_best_values(fold_results_df)\n",
    "                fold_results_df = post_process(fold_results_df, bch_class_df)\n",
    "                fold_results_df.to_csv(f'{HSW_results_path}/HSWAllSegs_{HSW_results_name}.csv', index=False)\n",
    "                fold_results_df.to_pickle(f'{HSW_results_path}/HSWAllSegs_{HSW_results_name}.pkl')\n",
    "                # break\n",
    "                test_fold_results_df  = find_pareto_front(test_fold_results_df)\n",
    "                test_fold_results_df = find_best_values(test_fold_results_df)\n",
    "                test_fold_results_df = post_process(test_fold_results_df, bch_class_df)\n",
    "                test_fold_results_df.to_csv(f'{HSW_results_path}/test_HSWAllSegs_{HSW_results_name}.csv', index=True)\n",
    "                test_fold_results_df.to_pickle(f'{HSW_results_path}/test_HSWAllSegs_{HSW_results_name}.pkl')\n",
    "                break\n",
    "        \n",
    "        \"\"\"Extract best model and append synthetics\"\"\"\n",
    "        # Find the index of the row with the largest value in the 'max_overall_balanced_acc_imp' column\n",
    "        index_of_max_imp = test_fold_results_df['imp_overall_balanced_accuracy'].idxmax()\n",
    "        print(index_of_max_imp)\n",
    "\n",
    "        # Retrieve the row corresponding to this index\n",
    "        row_with_largest_value = test_fold_results_df.loc[index_of_max_imp]\n",
    "\n",
    "        filtered_syn_df = data_syn[data_syn['index_meta'].isin(row_with_largest_value['retrained_dots_list'])]\n",
    "\n",
    "        X_train_re = pd.concat([X_train_r, filtered_syn_df.drop(columns=['topic_name'])])\n",
    "        Y_train_re = pd.concat([Y_train_r, filtered_syn_df['topic_name']])\n",
    "\n",
    "        train_pool_re = Pool(\n",
    "            X_train_re[[\"text\", \"area_TEIS\"]],\n",
    "            Y_train_re,\n",
    "            text_features=[\"text\"],\n",
    "            cat_features=[\"area_TEIS\"]\n",
    "        )\n",
    "        valid_pool_re = Pool(\n",
    "            X_test_re[[\"text\", \"area_TEIS\"]],\n",
    "            Y_test_re,\n",
    "            text_features=[\"text\"],\n",
    "            cat_features=[\"area_TEIS\"]\n",
    "        )\n",
    "\n",
    "        catboost_params = catboost_params\n",
    "                    \n",
    "        # Model Training\n",
    "        model_re = CatBoostClassifier(**catboost_params)\n",
    "        # start_time = time.time()  # Start timing\n",
    "        model_re.fit(train_pool_re, eval_set=valid_pool_re)\n",
    "        # training_time = time.time() - start_time  # End timing\n",
    "\n",
    "        # Save the retrain performances\n",
    "        predictions = model_re.predict(X_test_re_Test[[\"text\", \"area_TEIS\"]])\n",
    "        accuracy = accuracy_score(Y_test_re_Test, predictions)\n",
    "        report = classification_report(Y_test_re_Test, predictions, digits=6, output_dict=True)\n",
    "        classification_df = classification_report_to_df(report, Y_test_re_Test, predictions)\n",
    "\n",
    "        print(classification_df)\n",
    "\n",
    "        iteration_repo = f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{iteration}/\"\n",
    "        os.makedirs(iteration_repo, exist_ok=True)\n",
    "        if classification_df.loc[topic_dict[\"T13\"], 'Diff Balanced Accuracy'] >= 0:\n",
    "            classification_df.to_csv(f\"{iteration_repo}/Bch_Itr_{iteration}.csv\", index=True)\n",
    "            classification_df.to_pickle(f\"{iteration_repo}/Bch_Itr_{iteration}.pkl\")\n",
    "\n",
    "            X_train_re.to_pickle(f\"{iteration_repo}/X_train_re_itr_{iteration}.pkl\")\n",
    "            Y_train_re.to_pickle(f\"{iteration_repo}/Y_train_re_itr_{iteration}.pkl\")\n",
    "        else:\n",
    "            iteration_noimprove_repo = f\"D:/Step_2_Pathway/Paper_GPU1h_Improve_T13_TBA/C_Run_{run}/Iteration_{iteration}/Itr_No_Improve\"\n",
    "            os.makedirs(iteration_noimprove_repo, exist_ok=True)\n",
    "            classification_df.to_csv(f\"{iteration_noimprove_repo}/Bch_Itr_{iteration}.csv\", index=True)\n",
    "            classification_df.to_pickle(f\"{iteration_noimprove_repo}/Bch_Itr_{iteration}.pkl\")\n",
    "\n",
    "            X_train_re.to_pickle(f\"{iteration_noimprove_repo}/X_train_re_itr_{iteration}.pkl\")\n",
    "            Y_train_re.to_pickle(f\"{iteration_noimprove_repo}/Y_train_re_itr_{iteration}.pkl\")\n",
    "\n",
    "            bch_class_df.to_csv(f\"{iteration_repo}/Bch_Itr_{iteration}.csv\", index=True)\n",
    "            bch_class_df.to_pickle(f\"{iteration_repo}/Bch_Itr_{iteration}.pkl\")\n",
    "\n",
    "            prev_itr_X_train_re.to_pickle(f\"{iteration_repo}/X_train_re_itr_{iteration}.pkl\")\n",
    "            prev_itr_Y_train_re.to_pickle(f\"{iteration_repo}/Y_train_re_itr_{iteration}.pkl\")\n",
    "        \n",
    "        # final_pareto_observer(history_pareto_segments_list)\n",
    "        # level_stats_df_allPCA = pd.concat([level_stats_df_allPCA, level_stats_df], axis=0, ignore_index=True)\n",
    "        # level_stats_df_allPCA.to_csv(f'{HSW_results_path}/{level_stats_df_name}.csv', index=False)\n",
    "        # level_stats_df_allPCA.to_pickle(f'{HSW_results_path}/{level_stats_df_name}.pkl')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
