{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9332611f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Import all utility functions\n",
    "from utility import *\n",
    "from matplotlib.ticker import LogLocator"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "34d716b6",
   "metadata": {},
   "source": [
    "## Income Data: Dynamics of $a_t$, $q_t$, and $\\Delta$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52a58a16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the income-embedding samples; later cells read the columns x1, x2, y and z\n",
    "df = pd.read_csv('income_embeddings.csv')\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "411e3d44",
   "metadata": {},
   "source": [
    "### Read Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6eaa841b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _select(label, group):\n",
    "    '''Return the rows of df whose y equals label and whose z equals group.'''\n",
    "    return df[(df['y'] == label) & (df['z'] == group)]\n",
    "\n",
    "\n",
    "# z selects the group (0 -> a, 1 -> b); y selects the label (0 -> n, 1 -> p)\n",
    "df_a_n, df_a_p = _select(0, 0), _select(1, 0)\n",
    "df_b_n, df_b_p = _select(0, 1), _select(1, 1)\n",
    "\n",
    "# Share of y == 1 samples within each group\n",
    "alpha_a = len(df_a_p) / (len(df_a_p) + len(df_a_n))\n",
    "alpha_b = len(df_b_p) / (len(df_b_p) + len(df_b_n))\n",
    "print(alpha_a, alpha_b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "94ddd3e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Observed minimum (l*) and maximum (r*) of each feature column\n",
    "lx1, rx1 = df['x1'].min(), df['x1'].max()\n",
    "lx2, rx2 = df['x2'].min(), df['x2'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ada2c28a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the bounds computed above\n",
    "(lx1, rx1, lx2, rx2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94c6aeba",
   "metadata": {},
   "source": [
    "### Fit Beta Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fa5b4389",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_beta(x_range, a, b, mu=0, sigma=1, cdf=False, **kwargs):\n",
    "    '''\n",
    "    Plot a beta distribution with parameters a and b over x_range.\n",
    "\n",
    "    mu and sigma are forwarded positionally after a and b (location and scale,\n",
    "    assuming beta is scipy.stats.beta -- it comes from the utility star import).\n",
    "    With the defaults mu=0, sigma=1 the standard beta is plotted.\n",
    "    If cdf=True the cumulative distribution is plotted instead of the density.\n",
    "    Any extra keyword arguments are passed on to plt.plot.\n",
    "    '''\n",
    "    evaluate = beta.cdf if cdf else beta.pdf\n",
    "    plt.plot(x_range, evaluate(x_range, a, b, mu, sigma), **kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "611b6278",
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot an overall distribution: both label subsets of group a pooled together\n",
    "df_a = pd.concat([df_a_n, df_a_p])\n",
    "all_params_1 = beta.fit(df_a['x1'], floc=0.3, fscale=0.3)\n",
    "# NOTE(review): all_params_2 is fitted here but not plotted in this cell\n",
    "all_params_2 = beta.fit(df_a['x2'], floc=0.55, fscale=0.35)\n",
    "\n",
    "# x1 is fitted with loc and scale fixed at 0.3, so the grid covers [0.3, 0.6] = [loc, loc + scale]\n",
    "x = np.linspace(0.3, 0.6, 1000)\n",
    "plot_beta(x, all_params_1[0], all_params_1[1], 0.3, 0.3, cdf=True, color='black', label=r\"$F_{X_1}$\")\n",
    "plt.xlabel(r'$X_1$')\n",
    "# The curve is given a label, so draw the legend; otherwise the label is never shown\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f18efc0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Samples keyed by group ('a'/'b') and then by label ('p'/'n')\n",
    "_subsets = {\n",
    "    'a': {'p': df_a_p, 'n': df_a_n},\n",
    "    'b': {'p': df_b_p, 'n': df_b_n},\n",
    "}\n",
    "\n",
    "\n",
    "def _fit_feature(column, loc, scale):\n",
    "    '''Fit a beta distribution to one column of every subset, with loc and scale held fixed.'''\n",
    "    return {\n",
    "        group: {\n",
    "            label: beta.fit(frame[column], floc=loc, fscale=scale)\n",
    "            for label, frame in by_label.items()\n",
    "        }\n",
    "        for group, by_label in _subsets.items()\n",
    "    }\n",
    "\n",
    "\n",
    "# save the beta distribution parameters: params_k[group][label] for feature x_k\n",
    "params_1 = _fit_feature('x1', 0.3, 0.3)\n",
    "params_2 = _fit_feature('x2', 0.55, 0.35)\n",
    "\n",
    "print(params_1, params_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38279256",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _plot_feature_panel(position, grid, frame_n, frame_p, fitted, column, loc, scale, title, xlabel):\n",
    "    '''\n",
    "    Draw panel number position of the 1x4 figure: fitted beta densities plus\n",
    "    histograms of one feature column for the 'n' (blue, unqualified) and\n",
    "    'p' (red, qualified) samples of one group.\n",
    "\n",
    "    fitted maps 'n'/'p' to a beta.fit result; its first two entries are passed\n",
    "    to plot_beta as a and b, together with the given loc and scale.\n",
    "    '''\n",
    "    plt.subplot(1, 4, position)\n",
    "    plt.ylim(0.1, 16.6)\n",
    "    plt.xlim(0, 1)\n",
    "    plot_beta(grid, fitted['n'][0], fitted['n'][1], loc, scale, color='blue', lw=2, ls='-')\n",
    "    plot_beta(grid, fitted['p'][0], fitted['p'][1], loc, scale, color='red', lw=2, ls='-')\n",
    "    plt.hist(frame_n[column], density=True, color='blue', bins=10, label='unqualified', alpha=0.5)\n",
    "    plt.hist(frame_p[column], density=True, bins=100, color='red', label='qualified', alpha=0.5)\n",
    "    plt.title(title)\n",
    "    plt.xlabel(xlabel)\n",
    "    plt.legend()\n",
    "\n",
    "\n",
    "# plot the distribution\n",
    "x = np.linspace(0, 1, 500)\n",
    "fig = plt.figure(figsize=(12, 2.5))\n",
    "\n",
    "# One row per panel, left to right: group a with X1 and X2, then group b with X1 and X2\n",
    "_panels = [\n",
    "    (df_a_n, df_a_p, params_1['a'], 'x1', 0.3, 0.3, r'Group $i$', r'$X_1$'),\n",
    "    (df_a_n, df_a_p, params_2['a'], 'x2', 0.55, 0.35, r'Group $i$', r'$X_2$'),\n",
    "    (df_b_n, df_b_p, params_1['b'], 'x1', 0.3, 0.3, r'Group $j$', r'$X_1$'),\n",
    "    (df_b_n, df_b_p, params_2['b'], 'x2', 0.55, 0.35, r'Group $j$', r'$X_2$'),\n",
    "]\n",
    "for _position, _panel in enumerate(_panels, start=1):\n",
    "    _plot_feature_panel(_position, x, *_panel)\n",
    "plt.tight_layout()\n",
    "\n",
    "# Create the output directory if needed so savefig does not fail on a fresh checkout;\n",
    "# save before show, as the monotone-likelihood cell does\n",
    "os.makedirs('plots_new', exist_ok=True)\n",
    "fig.savefig('plots_new/feature_dist_real_income.pdf')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "04f2902c",
   "metadata": {},
   "source": [
    "### Verify the monotone likelihood ratio property"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1320bd23",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_monotone(x_range, a0, b0, a1, b1, mu1=0, sigma1=1, **kwargs):\n",
    "    '''\n",
    "    Plot the ratio of two beta densities over x_range on a log-scaled y axis.\n",
    "\n",
    "    The plotted quantity is pdf(x; a1, b1) / pdf(x; a0, b0). Both densities use\n",
    "    the same mu1 and sigma1, forwarded positionally after the two parameters\n",
    "    (location and scale, assuming beta is scipy.stats.beta).\n",
    "    Any extra keyword arguments are passed on to plt.plot.\n",
    "    '''\n",
    "    y0 = beta.pdf(x_range, a0, b0, mu1, sigma1)\n",
    "    y1 = beta.pdf(x_range, a1, b1, mu1, sigma1)\n",
    "    # Where the denominator is 0 the division gives inf or nan; the values are kept\n",
    "    # as they are, only the RuntimeWarnings that would flood the cell output are silenced.\n",
    "    with np.errstate(divide='ignore', invalid='ignore'):\n",
    "        ratio_10 = y1 / y0\n",
    "    plt.plot(x_range, ratio_10, **kwargs)\n",
    "    plt.yscale('log')\n",
    "\n",
    "\n",
    "x = np.linspace(0, 1, 500)\n",
    "fig = plt.figure(figsize=(6, 2.5))\n",
    "\n",
    "# One panel per feature: (subplot spec, fitted parameters, loc, scale, x label)\n",
    "for _spec, _fitted, _loc, _scale, _xlabel in [\n",
    "    (121, params_1, 0.3, 0.3, r'$X_1$'),\n",
    "    (122, params_2, 0.55, 0.35, r'$X_2$'),\n",
    "]:\n",
    "    ax = fig.add_subplot(_spec)\n",
    "    plt.xlim(0, 1)\n",
    "    plt.ylim(1e-4, 10**15)\n",
    "    for _group in ('a', 'b'):\n",
    "        _neg, _pos = _fitted[_group]['n'], _fitted[_group]['p']\n",
    "        plot_monotone(x, _neg[0], _neg[1], _pos[0], _pos[1], _loc, _scale, lw=1.5, ls='-', label=f'Group {_group}')\n",
    "    ax.yaxis.set_major_locator(LogLocator(base=10**5))\n",
    "    ax.set_xlabel(_xlabel)\n",
    "    plt.legend()\n",
    "\n",
    "# NOTE(review): this acts on the current axes only, i.e. the second panel (unchanged\n",
    "# from the original); confirm whether the first panel should lose its minor ticks too\n",
    "plt.minorticks_off()\n",
    "plt.tight_layout()\n",
    "# Create the output directory if needed so savefig does not fail on a fresh checkout\n",
    "os.makedirs('plots_new', exist_ok=True)\n",
    "fig.savefig('plots_new/monotone.pdf')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ccefba6",
   "metadata": {},
   "source": [
    "### Experiment Begins\n",
    "\n",
    "\n",
    "- Note: slight quantitative differences may arise because the logistic classifier is implemented differently across sklearn versions (some of them now deprecated), but the qualitative results should remain the same."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "72337589",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings passed to simulation(Q, N, n, T, alphas, bias, mag, tp, r, ...) in the cells below.\n",
    "# NOTE(review): their exact meaning is defined in utility.simulation, which is not shown here.\n",
    "n = 10\n",
    "T = 15\n",
    "Q = np.array([\n",
    "    [5, 0],\n",
    "    [0, 5],\n",
    "])\n",
    "N = 2000\n",
    "# Per-group share of y == 1 samples, as computed from the data above\n",
    "alphas = {'a': alpha_a, 'b': alpha_b}\n",
    "tp = 4"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fa38e7a4",
   "metadata": {},
   "source": [
    "Group $i$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db97bc3f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Setting: r = 0.1, bias 'up', mag = 0.1 (fixed seed so the run is repeatable)\n",
    "np.random.seed(2)\n",
    "r = 0.1\n",
    "bias = 'up'\n",
    "mag = 0.1\n",
    "# des is handed to the plot_save_* helpers below; it does not encode the group\n",
    "des = f\"income_setting_ratio{r}_bias{bias}_mag{mag}\"\n",
    "# No group argument is passed, so simulation falls back to its default group\n",
    "# (presumably 'a', matching the Group i heading -- confirm in utility.simulation)\n",
    "At, Qt, At_sd, Qt_sd = simulation(Q,N,n,T,alphas,bias,mag,tp,r,params_1=params_1,params_2=params_2,sd=True)\n",
    "plot_save_single(At, Qt, des, False)\n",
    "plot_save_single_err(At, Qt, At_sd, Qt_sd, des, False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e15767c8",
   "metadata": {},
   "source": [
    "Group $j$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7f6ee3f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Setting for group 'b': r = 0.1, bias 'down', mag = 0.1 (fixed seed so the run is repeatable)\n",
    "np.random.seed(2)\n",
    "r, bias, mag = 0.1, 'down', 0.1\n",
    "des = f\"income_setting_ratio{r}_bias{bias}_mag{mag}\"\n",
    "At, Qt, At_sd, Qt_sd = simulation(\n",
    "    Q, N, n, T, alphas, bias, mag, tp, r,\n",
    "    params_1=params_1,\n",
    "    params_2=params_2,\n",
    "    group='b',\n",
    "    sd=True,\n",
    ")\n",
    "plot_save_single(At, Qt, des, False)\n",
    "plot_save_single_err(At, Qt, At_sd, Qt_sd, des, False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
