{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import warnings\n",
    "\n",
    "from statistics import median, mean\n",
    "from fanova.fanova import *\n",
    "\n",
    "%matplotlib inline\n",
    "warnings.filterwarnings(\"ignore\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Performance Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the performance dataset; the path is relative to the notebook's working directory.\n",
    "df = pd.read_csv('RF_Perf_Data.csv')\n",
    "# Bare last expression so the notebook renders the loaded frame.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Utility Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def indx(n):\n",
    "    \"\"\"Map a feature label 'x_000' ... 'x_010' to its integer position 0 ... 10.\n",
    "\n",
    "    Any other value is handed back unchanged.\n",
    "    \"\"\"\n",
    "    known_labels = (\n",
    "        \"x_000\", \"x_001\", \"x_002\", \"x_003\", \"x_004\", \"x_005\",\n",
    "        \"x_006\", \"x_007\", \"x_008\", \"x_009\", \"x_010\",\n",
    "    )\n",
    "    # First matching label wins; its position in the tuple is the result.\n",
    "    for position, label in enumerate(known_labels):\n",
    "        if n == label:\n",
    "            return position\n",
    "    return n\n",
    "\n",
    "def bar_plots(df_imp, fig_title):\n",
    "    \"\"\"Plot meta-feature importances as bars and save the figure as a PDF.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_imp : DataFrame\n",
    "        Must provide the columns 'meta-features' (drawn on the y-axis) and\n",
    "        'importance' (drawn on the x-axis).\n",
    "    fig_title : str\n",
    "        File-name prefix; the figure is written to '<fig_title>plot.pdf'.\n",
    "    \"\"\"\n",
    "    figure = plt.figure(figsize=(9, 5))\n",
    "\n",
    "    # Bars are ordered by decreasing mean importance per meta-feature.\n",
    "    mean_importance = df_imp.groupby([\"meta-features\"])[\"importance\"].mean()\n",
    "    bar_order = mean_importance.sort_values(ascending=False).index\n",
    "\n",
    "    axes = sns.barplot(\n",
    "        data=df_imp,\n",
    "        x=df_imp[\"importance\"],\n",
    "        y=df_imp[\"meta-features\"],\n",
    "        order=bar_order,\n",
    "        palette=\"rocket\",\n",
    "    )\n",
    "\n",
    "    # Title and both axis labels are set to a single space.\n",
    "    axes.set_title(' ')\n",
    "    axes.set(xlabel=' ', ylabel=' ')\n",
    "\n",
    "    for set_ticks in (plt.xticks, plt.yticks):\n",
    "        set_ticks(fontweight='bold', fontsize='xx-large')\n",
    "    plt.xlim(xmin=0, xmax=0.9)\n",
    "\n",
    "    # Save before plt.show() is called.\n",
    "    figure.savefig(fig_title + \"plot\" + \".pdf\", bbox_inches='tight', pad_inches=1, format='pdf')\n",
    "\n",
    "    plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Influence Analysis (fANOVA)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bias"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features: every column of df except the ones dropped here; target: avg_bias.\n",
    "X = df.drop(['Unnamed: 0','data_id', 'avg_bias', 'avgVariance', 'avg_expected_loss','L1'], axis=1)\n",
    "Y = df['avg_bias']\n",
    "\n",
    "X_columns = pd.DataFrame(list(X.columns), columns=['column_names'])\n",
    "\n",
    "# fANOVA Analysis\n",
    "f = fANOVA(X, Y, seed=42, n_trees=32, bootstrapping=True)\n",
    "\n",
    "# Individual marginal of each feature, queried one positional index at a time.\n",
    "# Each call returns a dict keyed by the 1-tuple (index,), so the updates never collide.\n",
    "importance_one = {}\n",
    "for feature_idx in range(X.shape[1]):\n",
    "    importance_one.update(f.quantify_importance((feature_idx,)))\n",
    "\n",
    "# Keys ordered from most to least important.\n",
    "sorted_importance = sorted(importance_one, key=lambda t: importance_one[t]['individual importance'], reverse=True)\n",
    "\n",
    "result_param = []\n",
    "result_imp = []\n",
    "one_wise_dict = {}\n",
    "for dims in sorted_importance:\n",
    "    # Look the name up by position instead of parsing the printed form of a numpy\n",
    "    # array, which mangled names starting or ending with quotes, brackets or spaces.\n",
    "    column_name = str(X.columns[dims[0]])\n",
    "    ind_importance = importance_one[dims]['individual importance']\n",
    "\n",
    "    one_wise_dict[column_name] = ind_importance\n",
    "    result_param.append(column_name)\n",
    "    result_imp.append(ind_importance)\n",
    "\n",
    "# Rows are already in ranked order, so the default integer index doubles as the rank.\n",
    "result_bias = pd.DataFrame({'meta-features': result_param, 'importance': result_imp})\n",
    "\n",
    "bar_plots(result_bias, 'RF_FANOVA_Bias')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the importance table for avg_bias (rows ordered from most to least important).\n",
    "result_bias"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Variance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features: every column of df except the ones dropped here; target: avgVariance.\n",
    "X = df.drop(['Unnamed: 0', 'data_id', 'avg_bias', 'avgVariance', 'avg_expected_loss','L1'], axis=1)\n",
    "Y = df['avgVariance']\n",
    "\n",
    "X_columns = pd.DataFrame(list(X.columns), columns=['column_names'])\n",
    "\n",
    "# fANOVA Analysis\n",
    "f = fANOVA(X, Y, seed=42, n_trees=32, bootstrapping=True)\n",
    "\n",
    "# Individual marginal of each feature, queried one positional index at a time.\n",
    "# Each call returns a dict keyed by the 1-tuple (index,), so the updates never collide.\n",
    "importance_one = {}\n",
    "for feature_idx in range(X.shape[1]):\n",
    "    importance_one.update(f.quantify_importance((feature_idx,)))\n",
    "\n",
    "# Keys ordered from most to least important.\n",
    "sorted_importance = sorted(importance_one, key=lambda t: importance_one[t]['individual importance'], reverse=True)\n",
    "\n",
    "result_param = []\n",
    "result_imp = []\n",
    "one_wise_dict = {}\n",
    "for dims in sorted_importance:\n",
    "    # Look the name up by position instead of parsing the printed form of a numpy\n",
    "    # array, which mangled names starting or ending with quotes, brackets or spaces.\n",
    "    column_name = str(X.columns[dims[0]])\n",
    "    ind_importance = importance_one[dims]['individual importance']\n",
    "\n",
    "    one_wise_dict[column_name] = ind_importance\n",
    "    result_param.append(column_name)\n",
    "    result_imp.append(ind_importance)\n",
    "\n",
    "# Rows are already in ranked order, so the default integer index doubles as the rank.\n",
    "result_var = pd.DataFrame({'meta-features': result_param, 'importance': result_imp})\n",
    "\n",
    "bar_plots(result_var, 'RF_FANOVA_Var')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the previous cell already ends with this exact call; running it again\n",
    "# re-draws the same chart and rewrites RF_FANOVA_Varplot.pdf.\n",
    "bar_plots(result_var, 'RF_FANOVA_Var')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total of the individual importances over all meta-features for avgVariance.\n",
    "sum(result_var['importance'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
