{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import statsmodels.api as sm\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('../dataset_final.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert decision to binary classes accept, reject\n",
    "decision_to_binary = {'Accept (Poster)': 1, 'Accept (Oral)': 1, 'Accept (Talk)': 1, 'Accept (Spotlight)': 1, 'Invite to Workshop Track': 0, 'Withdrawn': 0, 'Reject': 0}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fit logistic regression model with either first or last author genders\n",
    "def logistic_regression(author):\n",
    "    if author:\n",
    "        index=0\n",
    "    else:\n",
    "        index=-1\n",
    "        \n",
    "    scores = []\n",
    "    gender_indicator = []\n",
    "    \n",
    "    y = []\n",
    "    for ratings, decision, year, gender in zip(data['ratings'], data['decisions'], data['year'], data['genders']):\n",
    "        if year != 2020:\n",
    "            continue\n",
    "        \n",
    "        # get decision\n",
    "        binary_decision = decision_to_binary.get(decision)\n",
    "        \n",
    "        # get genders, omit unlablled authors\n",
    "        genders = gender.split(';')\n",
    "        if genders[index] == '-1' or genders[index] == 'u':\n",
    "            continue\n",
    "        \n",
    "        # compute gender indicator\n",
    "        if genders[index] == 'm':\n",
    "            gender_indicator.append(1)\n",
    "        else:\n",
    "            gender_indicator.append(0)\n",
    "        \n",
    "        # get mean reviewer score\n",
    "        rates = ratings.split(';')\n",
    "        rates = [int(x) for x in rates]\n",
    "        rating_avg = np.average(rates)\n",
    "\n",
    "        scores.append(rating_avg)\n",
    "        y.append(binary_decision)\n",
    "\n",
    "\n",
    "    X = pd.DataFrame()\n",
    "    X['mean reviewer score'] = scores\n",
    "    X['gender indicator'] = gender_indicator\n",
    "    X['constant'] = [1] * len(scores)\n",
    "    y = np.array(y)\n",
    "    \n",
    "    # fit logistic regression model\n",
    "    logreg = sm.Logit(y, X)\n",
    "    result = logreg.fit()\n",
    "    summary = result.summary()\n",
    "    \n",
    "    # compute test accuracy\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
    "    logreg = LogisticRegression(solver='newton-cg', fit_intercept = True, penalty='none')\n",
    "    result = logreg.fit(X_train, y_train)\n",
    "    \n",
    "    # return summary and accuracy\n",
    "    y_pred = result.predict(X_test)\n",
    "    return summary, logreg.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First author\n",
      "\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.207938\n",
      "         Iterations 9\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   No. Observations:                 2419\n",
      "Model:                          Logit   Df Residuals:                     2416\n",
      "Method:                           MLE   Df Model:                            2\n",
      "Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6412\n",
      "Time:                        02:13:51   Log-Likelihood:                -503.00\n",
      "converged:                       True   LL-Null:                       -1401.7\n",
      "Covariance Type:            nonrobust   LLR p-value:                     0.000\n",
      "=======================================================================================\n",
      "                          coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------\n",
      "mean reviewer score     2.3956      0.109     21.887      0.000       2.181       2.610\n",
      "gender indicator       -0.1045      0.266     -0.392      0.695      -0.627       0.418\n",
      "constant              -13.0241      0.633    -20.590      0.000     -14.264     -11.784\n",
      "=======================================================================================\n",
      "Accuracy:  0.9118457300275482\n",
      "Last author\n",
      "\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.210489\n",
      "         Iterations 9\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   No. Observations:                 2497\n",
      "Model:                          Logit   Df Residuals:                     2494\n",
      "Method:                           MLE   Df Model:                            2\n",
      "Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6388\n",
      "Time:                        02:13:51   Log-Likelihood:                -525.59\n",
      "converged:                       True   LL-Null:                       -1455.2\n",
      "Covariance Type:            nonrobust   LLR p-value:                     0.000\n",
      "=======================================================================================\n",
      "                          coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------\n",
      "mean reviewer score     2.3813      0.107     22.349      0.000       2.172       2.590\n",
      "gender indicator        0.0520      0.272      0.191      0.849      -0.482       0.586\n",
      "constant              -13.0441      0.627    -20.800      0.000     -14.273     -11.815\n",
      "=======================================================================================\n",
      "Accuracy:  0.9266666666666666\n"
     ]
    }
   ],
   "source": [
    "print('First author\\n')\n",
    "summary_first, score_first = logistic_regression(True)\n",
    "print(summary_first)\n",
    "print('Accuracy: ', score_first)\n",
    "print('Last author\\n')\n",
    "summary_last, score_last = logistic_regression(False)\n",
    "print(summary_last)\n",
    "print('Accuracy: ', score_last)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
