{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Overview\n",
    "\n",
    "This notebook provides the code to recreate the simulations included in the \"Fair Feature Importance Scores for Interpreting Tree-based Methods and Surrogates\" manuscript. The simulate_data function simulates data as specified by the input parameters. We provide examples on a decision tree classifier, boosting classifier, and a random forest classifier on a linear scenario. We also provide examples on a decision tree regressor, boosting regressor, and a random forest regressor on a linear scenario. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import accuracy_score\n",
    "import math\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from scipy.special import expit\n",
    "from scipy.stats import pearsonr\n",
    "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n",
    "from FairFIS import fis_tree, fis_forest, fis_boosting\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor, RandomForestRegressor\n",
    "from FairFIS import util\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def simulate_data(nrow, ncol, alphas, betas, p, seed, model_type, classification = True):\n",
    "\n",
    "    \"\"\"\n",
    "    Simulates biased data.\n",
    "    Inputs:\n",
    "      * nrow - the number of rows in the output dataset.\n",
    "      * ncol - the number of non-protected covariates X_i in the output dataset.\n",
    "      * alphas - numpy array of scalars, where alpha_i controls the effect of\n",
    "        protected attribute z on covariate X_i through the relationship\n",
    "        X_i ~ Normal(alpha_i*z, 1).\n",
    "      * betas - numpy array of scalars, where beta_i controls the effect of\n",
    "        covariate X_i on the binary outcome y; can be thought of as the regression\n",
    "        coefficients in the logistic regression scenario.\n",
    "      * p - the probability of success (aka value 1) for our protected attribute;\n",
    "        should be less than 0.5.\n",
    "      * model_type - Options are 'linear', 'nonlinear', 'nonlinear_int'. These correspond to \n",
    "        a linear model, a nonlinear additive model, and a nonlinear additive model with \n",
    "        pairwise interactions.\n",
    "      * classification = True (default) will return a classification dataset, \n",
    "        while classifcation = False will return a regression dataset.\n",
    "    Returns:\n",
    "      * X matrix, y vector, z vector\n",
    "    \"\"\"\n",
    "\n",
    "    # the following two assertions are to check that the input parameters\n",
    "    # make sense - we should have one value of alpha and beta for each covariate\n",
    "    assert ncol == len(alphas)\n",
    "    assert ncol == len(betas)\n",
    "    random.seed(seed)\n",
    "    betas = np.reshape(betas, (len(betas),1))\n",
    "    z = np.random.binomial(1, p, size=nrow)\n",
    "    X = np.zeros((nrow, ncol))\n",
    "\n",
    "    if model_type == 'linear':\n",
    "        for i in range(ncol):\n",
    "            X[:,i] =  alphas[i]*z + np.random.normal(loc = 0 , scale = 0.1, size = nrow)\n",
    "        y_prob = X@betas + np.random.normal(loc = 0 , scale = 0.01, size = (nrow,1))\n",
    "        \n",
    "    if model_type == 'nonlinear':\n",
    "        for i in range(ncol):\n",
    "            X[:,i] =  alphas[i]*z + np.random.normal(loc = 0 , scale = 0.1, size = nrow)\n",
    "        y_prob = np.sin(X)@betas + np.random.normal(loc = 0 , scale = 0.01, size = (nrow,1))\n",
    "        \n",
    "    if model_type == 'nonlinear_int':\n",
    "        for i in range(ncol):\n",
    "            X[:,i] =  alphas[i]*z + np.random.normal(loc = 0 , scale = 0.1, size = nrow)\n",
    "            X[:,0] = X[:,0]*X[:,1]\n",
    "            X[:,3] = X[:,3]*X[:,4]\n",
    "            X[:,6] = X[:,6]*X[:,7]\n",
    "        y_prob = np.sin(X)@betas + np.random.normal(loc = 0 , scale = 0.01, size = (nrow,1))\n",
    "    \n",
    "    if classification == True:\n",
    "        y_prob = expit(y_prob)\n",
    "        y = (y_prob >= .5).astype(int)\n",
    "    else:\n",
    "        y = y_prob\n",
    "  \n",
    "    return X,y,z"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Classification Simulation Setup\n",
    "iterations = 10\n",
    "seed_vec = list(range(iterations))\n",
    "nrow = 1000\n",
    "ncol = 12\n",
    "a = 2\n",
    "b = 1\n",
    "alphas = np.array([a,a,a,a,a,a,0,0,0,0,0,0])\n",
    "betas = np.array([b,b,b,0,0,0,b,b,b,0,0,0])\n",
    "p = 0.2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Decision Tree Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dp_mat = np.empty([iterations,ncol])\n",
    "eo_mat = np.empty([iterations,ncol])\n",
    "acc_mat = np.empty([iterations,ncol])\n",
    "\n",
    "for i, s in enumerate(seed_vec):\n",
    "  X,y,z = simulate_data(nrow,ncol,alphas, betas,p,s,'linear')\n",
    "  clf = DecisionTreeClassifier(max_depth = 8)\n",
    "  clf.fit(X,y)\n",
    "  \n",
    "  #FairFIS\n",
    "  f_forest = fis_tree(clf,X,y,z,0, triangle = False)\n",
    "  f_forest.calculate_fairness_importance_score()\n",
    "  eo_mat[i] = f_forest._fairness_importance_score_eqop_root\n",
    "  dp_mat[i] = f_forest._fairness_importance_score_dp_root\n",
    "  acc_mat[i] = clf.feature_importances_\n",
    "\n",
    "eo_mean = np.mean(eo_mat, axis = 0)\n",
    "dp_mean = np.mean(dp_mat, axis = 0)\n",
    "acc_mean = np.mean(acc_mat, axis = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Display results for decision tree classifier\n",
    "features = [1,2,3,4,5,6,7,8,9,10,11,12]\n",
    "data = {'Feature': features,\n",
    "    'DP': dp_mean,\n",
    "         'EO': eo_mean,\n",
    "             'ACC': acc_mean}\n",
    "  \n",
    "# Create DataFrame\n",
    "tree_scores = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Boosting Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dp_mat = np.empty([iterations,ncol])\n",
    "eo_mat = np.empty([iterations,ncol])\n",
    "acc_mat = np.empty([iterations,ncol])\n",
    "\n",
    "for i, s in enumerate(seed_vec):\n",
    "  X,y,z = simulate_data(nrow,ncol,alphas, betas,p,s,'linear')\n",
    "  clf = GradientBoostingClassifier(n_estimators=100, max_depth=5, max_features='auto')\n",
    "  clf.fit(X,y)\n",
    "  \n",
    "  #FairFIS\n",
    "  f_forest = fis_boosting(clf,X,y,z,0, triangle = False)\n",
    "  f_forest.calculate_fairness_importance_score()\n",
    "  \n",
    "  dp_mat[i] = f_forest._fairness_importance_score_dp_root\n",
    "  eo_mat[i] = f_forest._fairness_importance_score_eqop_root\n",
    "  acc_mat[i] = f_forest.fitted_clf.feature_importances_\n",
    "\n",
    "\n",
    "\n",
    "dp_mean = np.mean(dp_mat, axis = 0)\n",
    "eo_mean = np.mean(eo_mat, axis = 0)\n",
    "acc_mean = np.mean(acc_mat, axis = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Display results for boosting classifier\n",
    "features = [1,2,3,4,5,6,7,8,9,10,11,12]\n",
    "data = {'Feature': features,\n",
    "    'DP': dp_mean,\n",
    "         'EO': eo_mean,\n",
    "             'ACC': acc_mean}\n",
    "  \n",
    "# Create DataFrame\n",
    "boosting_scores = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Random Forest Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dp_mat = np.empty([iterations,ncol])\n",
    "eo_mat = np.empty([iterations,ncol])\n",
    "acc_mat = np.empty([iterations,ncol])\n",
    "\n",
    "for i, s in enumerate(seed_vec):\n",
    "  X,y,z = simulate_data(nrow,ncol,alphas, betas,p,s,'linear')\n",
    "  clf = RandomForestClassifier(n_estimators=100,n_jobs=-2)\n",
    "  clf.fit(X,y)\n",
    "  \n",
    "  #Our approach\n",
    "  f_forest = fis_forest(clf,X,y,z,0, triangle = False)\n",
    "  f_forest.calculate_fairness_importance_score()\n",
    "  \n",
    "  dp_mat[i] = f_forest._fairness_importance_score_dp_root\n",
    "  eo_mat[i] = f_forest._fairness_importance_score_eqop_root\n",
    "  acc_mat[i] = f_forest.fitted_clf.feature_importances_\n",
    "\n",
    "\n",
    "eo_mean = np.mean(eo_mat, axis = 0)\n",
    "dp_mean = np.mean(dp_mat, axis = 0)\n",
    "acc_mean = np.mean(acc_mat, axis = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = [1,2,3,4,5,6,7,8,9,10,11,12]\n",
    "data = {'Feature': features,\n",
    "    'DP': dp_mean,\n",
    "         'EO': eo_mean,\n",
    "             'ACC': acc_mean}\n",
    "  \n",
    "# Create DataFrame\n",
    "rf_scores = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Regression Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Regression Simulation Setup\n",
    "iterations = 10\n",
    "seed_vec = list(range(iterations))\n",
    "nrow = 1000\n",
    "ncol = 12\n",
    "a = 0.5\n",
    "b = 5\n",
    "alphas = np.array([a,a,a,a,a,a,0,0,0,0,0,0])\n",
    "betas = np.array([b,b,b,0,0,0,b,b,b,0,0,0])\n",
    "p = 0.2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Decision Tree Regressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dp_mat = np.empty([iterations,ncol])\n",
    "eo_mat = np.empty([iterations,ncol])\n",
    "acc_mat = np.empty([iterations,ncol])\n",
    "\n",
    "for i, s in enumerate(seed_vec):\n",
    "  X,y,z = simulate_data(nrow,ncol,alphas, betas,p,s,'linear', classification = False)\n",
    "  clf = DecisionTreeRegressor(max_depth = 8)\n",
    "  clf.fit(X,y)\n",
    "  \n",
    "  #Our approach\n",
    "  f_forest = fis_tree(clf,X,y,z,0, regression=True)\n",
    "  f_forest.calculate_fairness_importance_score()\n",
    "  \n",
    "  dp_mat[i] = f_forest._fairness_importance_score_dp_root\n",
    "  acc_mat[i] = clf.feature_importances_\n",
    "\n",
    "\n",
    "\n",
    "dp_mean = np.mean(dp_mat, axis = 0)\n",
    "acc_mean = np.mean(acc_mat, axis = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = [1,2,3,4,5,6,7,8,9,10,11,12]\n",
    "data = {'Feature': features,\n",
    "    'DP': dp_mean,\n",
    "         'EO': eo_mean,\n",
    "             'ACC': acc_mean}\n",
    "  \n",
    "# Create DataFrame\n",
    "tree_scores = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Boosting Regressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dp_mat = np.empty([iterations,ncol])\n",
    "eo_mat = np.empty([iterations,ncol])\n",
    "acc_mat = np.empty([iterations,ncol])\n",
    "\n",
    "for i, s in enumerate(seed_vec):\n",
    "  X,y,z = simulate_data(nrow,ncol,alphas, betas,p,s,'linear', classification = False)\n",
    "  clf = GradientBoostingRegressor(n_estimators=100, max_depth=5, max_features='auto')\n",
    "  clf.fit(X,y)\n",
    "  \n",
    "  #Our approach\n",
    "  f_forest = fis_boosting(clf,X,y,z,0, regression=True)\n",
    "  f_forest.calculate_fairness_importance_score()\n",
    "  \n",
    "  dp_mat[i] = f_forest._fairness_importance_score_dp_root\n",
    "  acc_mat[i] = f_forest.fitted_clf.feature_importances_\n",
    "\n",
    "\n",
    "\n",
    "dp_mean = np.mean(dp_mat, axis = 0)\n",
    "acc_mean = np.mean(acc_mat, axis = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Display results for boosting regressor\n",
    "features = [1,2,3,4,5,6,7,8,9,10,11,12]\n",
    "data = {'Feature': features,\n",
    "    'DP': dp_mean,\n",
    "         'EO': eo_mean,\n",
    "             'ACC': acc_mean}\n",
    "  \n",
    "# Create DataFrame\n",
    "boosting_scores = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Random Forest Regressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dp_mat = np.empty([iterations,ncol])\n",
    "eo_mat = np.empty([iterations,ncol])\n",
    "acc_mat = np.empty([iterations,ncol])\n",
    "\n",
    "for i, s in enumerate(seed_vec):\n",
    "  X,y,z = simulate_data(nrow,ncol,alphas, betas,p,s,'linear', classification = False)\n",
    "  clf = RandomForestRegressor(n_estimators=100,n_jobs=-2)\n",
    "  clf.fit(X,y)\n",
    "  \n",
    "  #Our approach\n",
    "  f_forest = fis_forest(clf,X,y,z,0, regression=True)\n",
    "  f_forest.calculate_fairness_importance_score()\n",
    "  \n",
    "  dp_mat[i] = f_forest._fairness_importance_score_dp_root\n",
    "  acc_mat[i] = f_forest.fitted_clf.feature_importances_\n",
    "\n",
    "\n",
    "\n",
    "dp_mean = np.mean(dp_mat, axis = 0)\n",
    "acc_mean = np.mean(acc_mat, axis = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Display results for regression regressor\n",
    "features = [1,2,3,4,5,6,7,8,9,10,11,12]\n",
    "data = {'Feature': features,\n",
    "    'DP': dp_mean,\n",
    "         'EO': eo_mean,\n",
    "             'ACC': acc_mean}\n",
    "  \n",
    "# Create DataFrame\n",
    "regression_scores = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
