{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import warnings\n",
    "import openml\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.feature_selection import mutual_info_classif\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from mlxtend.evaluate import bias_variance_decomp\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from pymfe.mfe import MFE\n",
    "\n",
    "\n",
    "%matplotlib inline\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Utility Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to estimate informative and redundant features from real-world dataset\n",
    "def estimate_informative_redundant_features(real_data):\n",
    "    # Separate features and target\n",
    "    features = real_data.drop(columns='target')\n",
    "    target = real_data['target']\n",
    "    \n",
    "    # Estimate mutual information to find informative features\n",
    "    mi_scores = mutual_info_classif(features, target, random_state=42)\n",
    "    \n",
    "    # Set a threshold for informative features (e.g., top 50% based on mutual information score)\n",
    "    threshold = np.median(mi_scores)\n",
    "    informative_features = [i for i, score in enumerate(mi_scores) if score > threshold]\n",
    "    n_informative = len(informative_features)\n",
    "    \n",
    "    # Estimate redundant features by calculating correlation between features\n",
    "    correlation_matrix = features.corr().abs()\n",
    "    \n",
    "    # Set a threshold for redundant features (e.g., correlation > 0.8)\n",
    "    redundant_features = set()\n",
    "    for i in range(correlation_matrix.shape[0]):\n",
    "        for j in range(i + 1, correlation_matrix.shape[1]):\n",
    "            if correlation_matrix.iloc[i, j] > 0.8:\n",
    "                redundant_features.add(j)\n",
    "    n_redundant = len(redundant_features)\n",
    "    \n",
    "    return n_informative, n_redundant\n",
    "\n",
    "# Function to generate synthetic classification data similar to the real-world dataset\n",
    "def generate_synthetic_data(real_data, n_samples=None, class_sep=0.1, random_state=None):\n",
    "    # Get the number of informative and redundant features\n",
    "    n_informative, n_redundant = estimate_informative_redundant_features(real_data)\n",
    "\n",
    "    # Separate features from the target\n",
    "    features = real_data.drop(columns='target')\n",
    "    target = real_data['target']\n",
    "    \n",
    "    # Total number of features in the real dataset\n",
    "    n_features = features.shape[1]\n",
    "    \n",
    "    # Number of unique classes in the target\n",
    "    n_classes = len(np.unique(target))\n",
    "    \n",
    "    # If number of samples is not provided, use the same number of rows as the original dataset\n",
    "    if n_samples is None:\n",
    "        n_samples = len(real_data)\n",
    "    \n",
    "    # Generate synthetic dataset using make_classification\n",
    "    X, y = make_classification(\n",
    "        n_samples=n_samples,\n",
    "        n_features=n_features,\n",
    "        n_informative=n_informative,\n",
    "        n_redundant=n_redundant,\n",
    "        n_classes=n_classes,\n",
    "        class_sep=class_sep,\n",
    "        weights=[0.4,0.6],\n",
    "        random_state=random_state\n",
    "    )\n",
    "    \n",
    "    # Create a DataFrame for the synthetic features\n",
    "    synthetic_features = pd.DataFrame(X, columns=features.columns)\n",
    "\n",
    "    # Add the synthetic target column\n",
    "    synthetic_features['target'] = y\n",
    "\n",
    "    return synthetic_features\n",
    "\n",
    "# Function to visualize the synthetic data (for 2D visualization)\n",
    "def visualize_2d_data(X, y, title=\"Synthetic Binary Classification Data\"):\n",
    "    plt.figure(figsize=(8, 6))\n",
    "    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolor='k', s=20)\n",
    "    plt.title(title)\n",
    "    plt.xlabel(\"Feature 1\")\n",
    "    plt.ylabel(\"Feature 2\")\n",
    "    plt.grid(True)\n",
    "    plt.show()\n",
    "    \n",
    "    plt.savefig(\"Synthetic Data\"+\".png\" ,bbox_inches = 'tight',pad_inches = 0.5, format='png')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Synthetic Dataset Generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the real-world reference data (OpenML task 31)\n",
    "task = openml.tasks.get_task(31)\n",
    "features, targets = task.get_X_and_y()\n",
    "\n",
    "# Combine features and labels into one frame with a 'target' column\n",
    "df_data = pd.DataFrame(features)\n",
    "df_data['target'] = pd.DataFrame(targets)\n",
    "\n",
    "# Object-dtype labels are replaced by integer codes\n",
    "if df_data['target'].dtype == 'object':\n",
    "    le = LabelEncoder()\n",
    "    df_data['target'] = le.fit_transform(df_data['target'])\n",
    "\n",
    "# Settings for the synthetic data\n",
    "n_samples = 700      # number of synthetic rows (any value may be used)\n",
    "random_state = 42    # seed for reproducibility\n",
    "class_sep = 0.1      # class separation passed to the generator\n",
    "\n",
    "synthetic_data = generate_synthetic_data(\n",
    "    df_data, n_samples=n_samples, class_sep=class_sep, random_state=random_state\n",
    ")\n",
    "\n",
    "s_target = synthetic_data['target']\n",
    "s_features = synthetic_data.drop(columns=['target'])\n",
    "\n",
    "# N1 complexity of the synthetic data\n",
    "mfe = MFE(features=[\"n1\"], groups=[\"complexity\"], summary=[\"mean\"], random_state=42)\n",
    "mfe.fit(s_features.values, s_target.values)\n",
    "ft = mfe.extract()\n",
    "# Join the entries of ft[1] into text and trim bracket/quote characters before printing\n",
    "n = \" \".join(str(value) for value in ft[1]).strip(\"[' ']\")\n",
    "print(n)\n",
    "\n",
    "# Scatter plot of the first two synthetic features\n",
    "visualize_2d_data(\n",
    "    s_features.values,\n",
    "    s_target.values,\n",
    "    title=\"Synthetic Binary Classification with Class Separation\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hyperparameter Manipulation (Min_Samples_Leaf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bias and variance recorded for each min_samples_leaf value\n",
    "bias = []\n",
    "var = []\n",
    "\n",
    "# Fetch the real-world reference data (OpenML task 31)\n",
    "task = openml.tasks.get_task(31)\n",
    "features, targets = task.get_X_and_y()\n",
    "\n",
    "# Combine features and labels into one frame with a 'target' column\n",
    "df_data = pd.DataFrame(features)\n",
    "df_data['target'] = pd.DataFrame(targets)\n",
    "\n",
    "# Object-dtype labels are replaced by integer codes\n",
    "if df_data['target'].dtype == 'object':\n",
    "    le = LabelEncoder()\n",
    "    df_data['target'] = le.fit_transform(df_data['target'])\n",
    "\n",
    "# Settings for the synthetic data\n",
    "n_samples = 700      # number of synthetic rows (any value may be used)\n",
    "random_state = 42    # seed for reproducibility\n",
    "class_sep = 0.1      # class separation passed to the generator\n",
    "\n",
    "synthetic_data = generate_synthetic_data(\n",
    "    df_data, n_samples=n_samples, class_sep=class_sep, random_state=random_state\n",
    ")\n",
    "\n",
    "s_target = synthetic_data['target']\n",
    "s_features = synthetic_data.drop(columns=['target'])\n",
    "\n",
    "# Hold out 30% of the rows for testing, stratified on the label\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    s_features, s_target, test_size=0.3, random_state=123, stratify=s_target\n",
    ")\n",
    "\n",
    "# Candidate values for min_samples_leaf: 1 through 10\n",
    "min_samples_leaf = np.arange(1, 11)\n",
    "\n",
    "# One forest per candidate value, each followed by a 0-1 loss bias-variance decomposition\n",
    "for param in min_samples_leaf:\n",
    "    model = RandomForestClassifier(min_samples_leaf=param, random_state=42, n_jobs=-1)\n",
    "    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(\n",
    "        model,\n",
    "        X_train.values,\n",
    "        y_train.values,\n",
    "        X_test.values,\n",
    "        y_test.values,\n",
    "        loss='0-1_loss',\n",
    "        random_seed=123,\n",
    "    )\n",
    "    bias.append(avg_bias)\n",
    "    var.append(avg_var)\n",
    "\n",
    "# Bias and variance curves over the parameter grid\n",
    "plt.figure(figsize=(8, 5))\n",
    "for curve, colour, name in ((bias, 'blue', 'Bias'), (var, 'red', 'Variance')):\n",
    "    plt.plot(min_samples_leaf, curve, color=colour, label=name)\n",
    "\n",
    "plt.legend(loc='best')\n",
    "plt.ylim(0.0, 0.45)\n",
    "# The x axis is reversed, so the largest min_samples_leaf value is drawn on the left\n",
    "plt.gca().invert_xaxis()\n",
    "plt.xlabel('Min_samples_leaf', fontweight='bold', fontsize='large')\n",
    "plt.ylabel('Error', fontweight='bold', fontsize='large')\n",
    "plt.xticks(fontweight='bold', fontsize='large')\n",
    "plt.yticks(fontweight='bold', fontsize='large')\n",
    "\n",
    "plt.savefig(\"Min_samples_leaf_Curve\" + \".pdf\", bbox_inches='tight', pad_inches=0.5, format='pdf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Meta-feature Manipulation (N1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# N1, bias and variance recorded for each class separation value\n",
    "n1 = []\n",
    "bias = []\n",
    "var = []\n",
    "\n",
    "# Fetch the real-world reference data (OpenML task 31)\n",
    "task = openml.tasks.get_task(31)\n",
    "features, targets = task.get_X_and_y()\n",
    "\n",
    "# Combine features and labels into one frame with a 'target' column\n",
    "df_data = pd.DataFrame(features)\n",
    "df_data['target'] = pd.DataFrame(targets)\n",
    "\n",
    "# Object-dtype labels are replaced by integer codes\n",
    "if df_data['target'].dtype == 'object':\n",
    "    le = LabelEncoder()\n",
    "    df_data['target'] = le.fit_transform(df_data['target'])\n",
    "\n",
    "# Settings for the synthetic data\n",
    "n_samples = 700      # number of synthetic rows (any value may be used)\n",
    "random_state = 42    # seed for reproducibility\n",
    "class_sep = [0.1, 0.5, 0.9, 1.3, 1.7]  # class separation values to sweep\n",
    "\n",
    "# A single forest configuration is reused for every separation value\n",
    "model = RandomForestClassifier(min_samples_leaf=10, random_state=42, n_jobs=-1)\n",
    "\n",
    "# For each separation: generate data, decompose the 0-1 loss, measure N1\n",
    "for cs in class_sep:\n",
    "    synthetic_data = generate_synthetic_data(\n",
    "        df_data, n_samples=n_samples, class_sep=cs, random_state=random_state\n",
    "    )\n",
    "    s_target = synthetic_data['target']\n",
    "    s_features = synthetic_data.drop(columns=['target'])\n",
    "\n",
    "    # Hold out 30% of the rows for testing, stratified on the label\n",
    "    X_train, X_test, y_train, y_test = train_test_split(\n",
    "        s_features, s_target, test_size=0.3, random_state=123, shuffle=True, stratify=s_target\n",
    "    )\n",
    "\n",
    "    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(\n",
    "        model,\n",
    "        X_train.values,\n",
    "        y_train.values,\n",
    "        X_test.values,\n",
    "        y_test.values,\n",
    "        loss='0-1_loss',\n",
    "        random_seed=123,\n",
    "    )\n",
    "    bias.append(avg_bias)\n",
    "    var.append(avg_var)\n",
    "\n",
    "    # N1 complexity of the training split\n",
    "    mfe = MFE(features=[\"n1\"], groups=[\"complexity\"], summary=[\"mean\"], random_state=42)\n",
    "    mfe.fit(X_train.values, y_train.values)\n",
    "    ft = mfe.extract()\n",
    "    # Join the entries of ft[1] into text and trim bracket/quote characters\n",
    "    n = \" \".join(str(value) for value in ft[1]).strip(\"[' ']\")\n",
    "    n1.append(n)\n",
    "\n",
    "# Two-decimal text labels; plotted as strings they become categorical x positions\n",
    "n1 = ['%.2f' % float(raw) for raw in n1]\n",
    "\n",
    "# Bias and variance curves against N1\n",
    "plt.figure(figsize=(8, 5))\n",
    "for curve, colour, name in ((bias, 'blue', 'Bias'), (var, 'red', 'Variance')):\n",
    "    plt.plot(n1, curve, color=colour, label=name)\n",
    "\n",
    "plt.legend(loc='best')\n",
    "plt.ylim(0, 0.45)\n",
    "plt.xlabel('N1', fontweight='bold', fontsize='large')\n",
    "plt.ylabel('Error', fontweight='bold', fontsize='large')\n",
    "plt.xticks(fontweight='bold', fontsize='large')\n",
    "plt.yticks(fontweight='bold', fontsize='large')\n",
    "\n",
    "plt.savefig(\"N1_Curve\" + \".pdf\", bbox_inches='tight', pad_inches=0.5, format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
