{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8647dff",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from models import cmre, nmre, ndee, ocsvm\n",
    "from plots import plot_appendix, plot_main_paper\n",
    "import copy\n",
    "import pickle\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5f769f86",
   "metadata": {},
   "outputs": [],
   "source": [
    "RESULTS_DIR = ''"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a9aff9fa",
   "metadata": {},
   "source": [
    "# Simulation 1 Experiments (Appendix F.2.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "11e72f47",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_sgc_datasets(mr,\n",
    "                          sa_normal,\n",
    "                          sa_untrustworthy,\n",
    "                          te\n",
    "                          ):\n",
    "\n",
    "    # Load dataset\n",
    "    df = pd.read_excel('./datasets/default_of_credit_card_clients.xls', skiprows=1)\n",
    "    df = df.sample(frac=1).reset_index(drop=True)\n",
    "    \n",
    "\n",
    "    # Drop rows that are not needed\n",
    "    df = df[['SEX', 'EDUCATION', 'MARRIAGE', 'AGE']]\n",
    "\n",
    "    # Convert sex to binary variable\n",
    "    df['SEX'] = df['SEX'] - 1\n",
    "\n",
    "    # Convert marriage to binary variable\n",
    "    df['MARRIAGE'] = np.where(df['MARRIAGE'] > 1, 1, 0)\n",
    "\n",
    "    # Convert education to binary variable\n",
    "    df['EDUCATION'] = np.where(df['EDUCATION'] > 2, 1, 0)\n",
    "\n",
    "    # Min-max scale the data\n",
    "    df['AGE'] = (df['AGE'] - df['AGE'].min()) / (df['AGE'].max() - df['AGE'].min())\n",
    "    \n",
    "    # Use marriage as selection bias\n",
    "    agent_prob = 0.05  + (1-df['SEX'])*0.3 + (1-df['MARRIAGE'])*0.3\n",
    "    df['AGENT'] = np.random.binomial(1, agent_prob, len(df))\n",
    "    \n",
    "    # Generate employment variable\n",
    "    employment_prob = 0.05 \\\n",
    "                    + df['EDUCATION']*0.05 \\\n",
    "                    + df['MARRIAGE']*df['SEX']*0.3 \\\n",
    "                    + np.square(df['AGE'])*0.1 \\\n",
    "                    + df['AGENT'] * sa_untrustworthy + (1-df['AGENT'])*sa_normal\n",
    "    df['EMPLOYMENT'] = np.random.binomial(1, employment_prob, len(df))\n",
    "\n",
    "    # Generate default variable\n",
    "    default_prob = 0.05 \\\n",
    "                    + df['EDUCATION']*0.05 \\\n",
    "                    + df['MARRIAGE']*df['SEX']*0.3 \\\n",
    "                    + np.square(df['AGE'])*0.1 \\\n",
    "                    + df['EMPLOYMENT'] * te\n",
    "    df['DEFAULT'] = np.random.binomial(1, default_prob, len(df))\n",
    "\n",
    "    # Strategically misreport the dataset (misreported employment status)\n",
    "    prob_agent_employment = len(df[(df['AGENT'] == 1) & (df['EMPLOYMENT'] == 1)]) / len(df[df['AGENT'] == 1])\n",
    "    df['EMPLOYMENT'] = df['EMPLOYMENT'] + df['AGENT']*(1-df['EMPLOYMENT']) * np.random.binomial(1, ((employment_prob / (1-mr)) - employment_prob) / (1 - employment_prob),  len(df))\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ef121ca7",
   "metadata": {},
   "source": [
    "# Results for semi-synthetic loan experiments"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df49c467",
   "metadata": {},
   "source": [
    "### Estimate MR given different causal effects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c3d17c71",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_sims = 100\n",
    "causal_effects = [0.1, 0.2, 0.3, 0.4, 0.5]\n",
    "\n",
    "# One list of [sim_num, ce, mr] rows per estimator. Rows are collected in plain\n",
    "# lists and converted to DataFrames once after the loop, instead of growing a\n",
    "# DataFrame one row at a time.\n",
    "ce_result_columns = ['sim_num', 'ce', 'mr']\n",
    "ce_rows = {'cmre': [], 'nmre': [], 'ndee_all': [], 'ndee_no_s': [], 'ocsvm': []}\n",
    "\n",
    "# Perform a few simulations for each method\n",
    "for sim in range(num_sims):\n",
    "    # Set the random seed (once per simulation, shared by all causal effects)\n",
    "    np.random.seed(sim)\n",
    "\n",
    "    for causal_effect in causal_effects:\n",
    "\n",
    "        # Generate dataset for simulation\n",
    "        df = generate_sgc_datasets(0.2, 0.0, 0.3, causal_effect)\n",
    "\n",
    "        normal_dataset = df[df['AGENT'] == 0]\n",
    "        strategic_dataset = df[df['AGENT'] == 1]\n",
    "\n",
    "        # Get misreporting rates and keep track of results\n",
    "        mr = cmre('EMPLOYMENT', 'DEFAULT', ['EDUCATION', 'SEX', 'AGE', 'MARRIAGE'], normal_dataset, strategic_dataset)\n",
    "        ce_rows['cmre'].append([sim, causal_effect, mr])\n",
    "\n",
    "        mr = nmre('EMPLOYMENT', 'DEFAULT', normal_dataset, strategic_dataset)\n",
    "        ce_rows['nmre'].append([sim, causal_effect, mr])\n",
    "\n",
    "        mr = ndee('AGENT', 'EMPLOYMENT', ['EDUCATION', 'SEX', 'MARRIAGE', 'AGE'], df)\n",
    "        ce_rows['ndee_all'].append([sim, causal_effect, mr])\n",
    "\n",
    "        mr = ndee('AGENT', 'EMPLOYMENT', ['EDUCATION', 'AGE'], df)\n",
    "        ce_rows['ndee_no_s'].append([sim, causal_effect, mr])\n",
    "\n",
    "        mr = ocsvm(normal_dataset, strategic_dataset)\n",
    "        ce_rows['ocsvm'].append([sim, causal_effect, mr])\n",
    "\n",
    "# Get mean and std of the estimated MR for each causal effect\n",
    "ce_summary = {\n",
    "    method: pd.DataFrame(rows, columns=ce_result_columns).groupby('ce')['mr'].agg(['mean', 'std']).reset_index()\n",
    "    for method, rows in ce_rows.items()\n",
    "}\n",
    "cmre_loan_results_ce_df_normal = ce_summary['cmre']\n",
    "nmre_loan_results_ce_df_normal = ce_summary['nmre']\n",
    "ndee_all_loan_results_ce_df_normal = ce_summary['ndee_all']\n",
    "ndee_no_s_loan_results_ce_df_normal = ce_summary['ndee_no_s']\n",
    "ocsvm_loan_results_ce_df_normal = ce_summary['ocsvm']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "daf34b1a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Pickle the causal-effect summaries. The list order (cmre, nmre, ndee_all,\n",
    "# ndee_no_s, ocsvm) matters: the plotting cells pick entries by position.\n",
    "loan_list_ce_normal = [\n",
    "    cmre_loan_results_ce_df_normal,\n",
    "    nmre_loan_results_ce_df_normal,\n",
    "    ndee_all_loan_results_ce_df_normal,\n",
    "    ndee_no_s_loan_results_ce_df_normal,\n",
    "    ocsvm_loan_results_ce_df_normal,\n",
    "]\n",
    "ce_pickle_path = f'{RESULTS_DIR}/loan_list_ce_normal.pkl'\n",
    "with open(ce_pickle_path, 'wb') as ce_pickle_file:\n",
    "    pickle.dump(loan_list_ce_normal, ce_pickle_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6068ccf",
   "metadata": {},
   "source": [
    "### Estimate MR given different Misreporting Rates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6b7ab613",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_sims = 100\n",
    "mr_probs = [0.0, 0.05, 0.10, 0.15, 0.20]\n",
    "\n",
    "# One list of [sim_num, true_mr, mr] rows per estimator. Rows are collected in\n",
    "# plain lists and converted to DataFrames once after the loop, instead of\n",
    "# growing a DataFrame one row at a time.\n",
    "mr_result_columns = ['sim_num', 'true_mr', 'mr']\n",
    "mr_rows = {'cmre': [], 'nmre': [], 'ndee_all': [], 'ndee_no_s': [], 'ocsvm': []}\n",
    "\n",
    "# Perform a few simulations for each method\n",
    "for sim in range(num_sims):\n",
    "    # Set the random seed (once per simulation, shared by all misreporting rates)\n",
    "    np.random.seed(sim)\n",
    "\n",
    "    for true_mr in mr_probs:\n",
    "\n",
    "        # Generate dataset for simulation\n",
    "        df = generate_sgc_datasets(true_mr, 0.0, 0.3, 0.4)\n",
    "\n",
    "        normal_dataset = df[df['AGENT'] == 0]\n",
    "        strategic_dataset = df[df['AGENT'] == 1]\n",
    "\n",
    "        # Get misreporting rates and keep track of results\n",
    "        mr = cmre('EMPLOYMENT', 'DEFAULT', ['EDUCATION', 'SEX', 'MARRIAGE', 'AGE'], normal_dataset, strategic_dataset)\n",
    "        mr_rows['cmre'].append([sim, true_mr, mr])\n",
    "\n",
    "        mr = nmre('EMPLOYMENT', 'DEFAULT', normal_dataset, strategic_dataset)\n",
    "        mr_rows['nmre'].append([sim, true_mr, mr])\n",
    "\n",
    "        mr = ndee('AGENT', 'EMPLOYMENT', ['EDUCATION', 'SEX', 'MARRIAGE', 'AGE'], df)\n",
    "        mr_rows['ndee_all'].append([sim, true_mr, mr])\n",
    "\n",
    "        mr = ndee('AGENT', 'EMPLOYMENT', ['EDUCATION', 'AGE'], df)\n",
    "        mr_rows['ndee_no_s'].append([sim, true_mr, mr])\n",
    "\n",
    "        mr = ocsvm(normal_dataset, strategic_dataset)\n",
    "        mr_rows['ocsvm'].append([sim, true_mr, mr])\n",
    "\n",
    "# Get mean and std of the estimated MR for each true misreporting rate\n",
    "mr_summary = {\n",
    "    method: pd.DataFrame(rows, columns=mr_result_columns).groupby('true_mr')['mr'].agg(['mean', 'std']).reset_index()\n",
    "    for method, rows in mr_rows.items()\n",
    "}\n",
    "cmre_loan_results_mr_df_normal = mr_summary['cmre']\n",
    "nmre_loan_results_mr_df_normal = mr_summary['nmre']\n",
    "ndee_all_loan_results_mr_df_normal = mr_summary['ndee_all']\n",
    "ndee_no_s_loan_results_mr_df_normal = mr_summary['ndee_no_s']\n",
    "ocsvm_loan_results_mr_df_normal = mr_summary['ocsvm']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "06b6e5bc",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Pickle the misreporting-rate summaries. The list order (cmre, nmre, ndee_all,\n",
    "# ndee_no_s, ocsvm) matters: the plotting cells pick entries by position.\n",
    "loan_list_mr_normal = [\n",
    "    cmre_loan_results_mr_df_normal,\n",
    "    nmre_loan_results_mr_df_normal,\n",
    "    ndee_all_loan_results_mr_df_normal,\n",
    "    ndee_no_s_loan_results_mr_df_normal,\n",
    "    ocsvm_loan_results_mr_df_normal,\n",
    "]\n",
    "mr_pickle_path = f'{RESULTS_DIR}/loan_list_mr_normal.pkl'\n",
    "with open(mr_pickle_path, 'wb') as mr_pickle_file:\n",
    "    pickle.dump(loan_list_mr_normal, mr_pickle_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "951b5c9d",
   "metadata": {},
   "source": [
    "### Estimate MR given different genuine adaptation rates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "2361d991",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_sims = 100\n",
    "sa_probs = [0.0, 0.05, 0.10, 0.15, 0.2, 0.25, 0.3]\n",
    "\n",
    "# One list of [sim_num, sa, mr] rows per estimator. Rows are collected in plain\n",
    "# lists and converted to DataFrames once after the loop, instead of growing a\n",
    "# DataFrame one row at a time.\n",
    "sa_result_columns = ['sim_num', 'sa', 'mr']\n",
    "sa_rows = {'cmre': [], 'nmre': [], 'ndee_all': [], 'ndee_no_s': [], 'ocsvm': []}\n",
    "\n",
    "# Perform a few simulations for each method\n",
    "for sim in range(num_sims):\n",
    "    # Set the random seed (once per simulation, shared by all adaptation rates)\n",
    "    np.random.seed(sim)\n",
    "\n",
    "    for sa in sa_probs:\n",
    "\n",
    "        # Generate dataset for simulation\n",
    "        df = generate_sgc_datasets(0.2, 0.0, sa, 0.4)\n",
    "\n",
    "        normal_dataset = df[df['AGENT'] == 0]\n",
    "        strategic_dataset = df[df['AGENT'] == 1]\n",
    "\n",
    "        # Get misreporting rates and keep track of results\n",
    "        mr = cmre('EMPLOYMENT', 'DEFAULT', ['EDUCATION', 'SEX', 'MARRIAGE', 'AGE'], normal_dataset, strategic_dataset)\n",
    "        sa_rows['cmre'].append([sim, sa, mr])\n",
    "\n",
    "        mr = nmre('EMPLOYMENT', 'DEFAULT', normal_dataset, strategic_dataset)\n",
    "        sa_rows['nmre'].append([sim, sa, mr])\n",
    "\n",
    "        mr = ndee('AGENT', 'EMPLOYMENT', ['EDUCATION', 'SEX', 'MARRIAGE', 'AGE'], df)\n",
    "        sa_rows['ndee_all'].append([sim, sa, mr])\n",
    "\n",
    "        mr = ndee('AGENT', 'EMPLOYMENT', ['EDUCATION', 'AGE'], df)\n",
    "        sa_rows['ndee_no_s'].append([sim, sa, mr])\n",
    "\n",
    "        mr = ocsvm(normal_dataset, strategic_dataset)\n",
    "        sa_rows['ocsvm'].append([sim, sa, mr])\n",
    "\n",
    "# Get mean and std of the estimated MR for each adaptation rate (sa)\n",
    "sa_summary = {\n",
    "    method: pd.DataFrame(rows, columns=sa_result_columns).groupby('sa')['mr'].agg(['mean', 'std']).reset_index()\n",
    "    for method, rows in sa_rows.items()\n",
    "}\n",
    "cmre_loan_results_sa_df_normal = sa_summary['cmre']\n",
    "nmre_loan_results_sa_df_normal = sa_summary['nmre']\n",
    "ndee_all_loan_results_sa_df_normal = sa_summary['ndee_all']\n",
    "ndee_no_s_loan_results_sa_df_normal = sa_summary['ndee_no_s']\n",
    "ocsvm_loan_results_sa_df_normal = sa_summary['ocsvm']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1fd85bbd",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Pickle the adaptation-rate summaries. The list order (cmre, nmre, ndee_all,\n",
    "# ndee_no_s, ocsvm) matters: the plotting cells pick entries by position.\n",
    "loan_list_sa_normal = [\n",
    "    cmre_loan_results_sa_df_normal,\n",
    "    nmre_loan_results_sa_df_normal,\n",
    "    ndee_all_loan_results_sa_df_normal,\n",
    "    ndee_no_s_loan_results_sa_df_normal,\n",
    "    ocsvm_loan_results_sa_df_normal,\n",
    "]\n",
    "sa_pickle_path = f'{RESULTS_DIR}/loan_list_sa_normal.pkl'\n",
    "with open(sa_pickle_path, 'wb') as sa_pickle_file:\n",
    "    pickle.dump(loan_list_sa_normal, sa_pickle_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ceadf2f5",
   "metadata": {},
   "source": [
    "# Create plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "93c7f716",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the three pickled result lists so the plots can be rebuilt without\n",
    "# re-running the simulations. Only unpickle files you produced yourself:\n",
    "# pickle.load can execute arbitrary code from an untrusted file.\n",
    "ce_results_path = f'{RESULTS_DIR}/loan_list_ce_normal.pkl'\n",
    "with open(ce_results_path, 'rb') as ce_results_file:\n",
    "    loan_list_ce_normal = pickle.load(ce_results_file)\n",
    "\n",
    "mr_results_path = f'{RESULTS_DIR}/loan_list_mr_normal.pkl'\n",
    "with open(mr_results_path, 'rb') as mr_results_file:\n",
    "    loan_list_mr_normal = pickle.load(mr_results_file)\n",
    "\n",
    "sa_results_path = f'{RESULTS_DIR}/loan_list_sa_normal.pkl'\n",
    "with open(sa_results_path, 'rb') as sa_results_file:\n",
    "    loan_list_sa_normal = pickle.load(sa_results_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc560c49",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create and save the appendix plot (all five estimators, one panel per x variable)\n",
    "synth_list = [loan_list_sa_normal, loan_list_ce_normal, loan_list_mr_normal]\n",
    "synth_mr_list = [0.2, 0.2, 0.2]\n",
    "label_list = ['CMRE (Ours)', 'NMRE', 'NDEE (All)', 'NDEE (no S)', 'OCSVM']\n",
    "x_var_list = ['sa', 'ce', 'true_mr']\n",
    "x_label_list = [r'\\textbf{Causal Effect of $A$ on $X^{*}$}', r'\\textbf{Causal Effect of $X^{*}$ on $Y$}', r'\\textbf{Misreporting Rate}']\n",
    "y_label = r'\\textbf{Estimated MR}'\n",
    "\n",
    "# Keep the result under its own name: assigning it to `plt` would shadow the\n",
    "# matplotlib.pyplot module imported at the top of the notebook.\n",
    "appendix_plot = plot_appendix(synth_list,\n",
    "                              label_list,\n",
    "                              synth_mr_list,\n",
    "                              x_var_list,\n",
    "                              x_label_list,\n",
    "                              y_label)\n",
    "\n",
    "appendix_plot.savefig(f'{RESULTS_DIR}/simulation_1_appendix_plot.pdf', dpi=600, bbox_inches='tight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db052685",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create and save the main-paper plot.\n",
    "# Positions 0, 1, 2 and 4 of each result list are kept; position 3 (the\n",
    "# ndee_no_s results) is left out, so four labels are supplied below.\n",
    "indices = [0,1,2,4]\n",
    "loan_list_sa_normal_small = [loan_list_sa_normal[i] for i in indices]\n",
    "loan_list_ce_normal_small = [loan_list_ce_normal[i] for i in indices]\n",
    "loan_list_mr_normal_small = [loan_list_mr_normal[i] for i in indices]\n",
    "synth_list = [loan_list_sa_normal_small, loan_list_ce_normal_small, loan_list_mr_normal_small]\n",
    "synth_mr_list = [0.2, 0.2, 0.2]\n",
    "label_list = [r'\\textbf{CMRE (Ours)}', r'\\textbf{NMRE}', r'\\textbf{NDEE}', r'\\textbf{OC-SVM}']\n",
    "x_var_list = ['sa', 'ce', 'true_mr']\n",
    "x_label_list = [r'\\textbf{Causal Effect of $A$ on $X^{*}$}', r'\\textbf{Causal Effect of $X^{*}$ on $Y$}', r'\\textbf{Misreporting Rate}']\n",
    "y_label = r'\\textbf{Estimated MR}'\n",
    "\n",
    "# Keep the result under its own name: assigning it to `plt` would shadow the\n",
    "# matplotlib.pyplot module imported at the top of the notebook.\n",
    "main_paper_plot = plot_main_paper(synth_list,\n",
    "                                  label_list,\n",
    "                                  synth_mr_list,\n",
    "                                  x_var_list,\n",
    "                                  x_label_list,\n",
    "                                  y_label)\n",
    "\n",
    "main_paper_plot.savefig(f'{RESULTS_DIR}/simulation_1_main_plot.pdf', dpi=600, bbox_inches='tight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90e71075",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
