{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import statsmodels.api as sm\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the dataset into a DataFrame. Later cells use its 'csranking', 'ratings',\n",
    "# 'decisions', 'year' and 'institution' columns.\n",
    "data = pd.read_csv(filepath_or_buffer='../dataset_final.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binary outcome per decision label: the four 'Accept (...)' labels map to 1,\n",
    "# every other listed label maps to 0.\n",
    "decision_to_binary = {\n",
    "    **dict.fromkeys(['Accept (Poster)', 'Accept (Oral)', 'Accept (Talk)', 'Accept (Spotlight)'], 1),\n",
    "    **dict.fromkeys(['Invite to Workshop Track', 'Withdrawn', 'Reject'], 0),\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one observation per (2020 paper, distinct rank listed for that paper).\n",
    "top_10_indicator = []  # 1 when the rank lies in 1..10, otherwise 0\n",
    "counts = [0] * 2       # [rows with a top-10 rank, rows with any other rank]\n",
    "scores = []            # mean reviewer score of the paper, repeated for each rank\n",
    "y = []                 # binary decision of the paper, repeated for each rank\n",
    "for rankings, ratings, decision, year in zip(data['csranking'], data['ratings'], data['decisions'], data['year']):\n",
    "    # only 2020 papers that have both ranking and rating information\n",
    "    if pd.isnull(rankings) or pd.isnull(ratings) or year != 2020:\n",
    "        continue\n",
    "\n",
    "    # get mean reviewer score (ratings are ';'-separated integers)\n",
    "    rating_avg = np.average([int(x) for x in ratings.split(';')])\n",
    "\n",
    "    # get distinct rankings; -1 entries are discarded\n",
    "    ranks_set = set(int(x) for x in rankings.split(';'))\n",
    "    ranks_set.discard(-1)\n",
    "    if not ranks_set:\n",
    "        # no usable rank, so this paper contributes no rows\n",
    "        continue\n",
    "\n",
    "    # get decision; index directly so that an unmapped label raises KeyError here\n",
    "    # instead of silently putting None into y and breaking the model fit later\n",
    "    binary_decision = decision_to_binary[decision]\n",
    "\n",
    "    # for each distinct rank\n",
    "    for rank in ranks_set:\n",
    "        scores.append(rating_avg)\n",
    "        y.append(binary_decision)\n",
    "\n",
    "        # compute top 10 indicator\n",
    "        is_top_10 = 1 <= rank <= 10\n",
    "        top_10_indicator.append(int(is_top_10))\n",
    "        counts[0 if is_top_10 else 1] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Design matrix: mean score, top-10 indicator and an explicit intercept column of ones.\n",
    "X = pd.DataFrame({\n",
    "    'mean reviewer score': scores,\n",
    "    'top ten school?': top_10_indicator,\n",
    "    'constant': [1] * len(scores),\n",
    "})\n",
    "y = np.array(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3139, 3)\n",
      "(3139,)\n"
     ]
    }
   ],
   "source": [
    "# sanity check: X and y should have the same number of rows\n",
    "print(X.shape, y.shape, sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.212936\n",
      "         Iterations 9\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   No. Observations:                 3139\n",
      "Model:                          Logit   Df Residuals:                     3136\n",
      "Method:                           MLE   Df Model:                            2\n",
      "Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6429\n",
      "Time:                        02:13:37   Log-Likelihood:                -668.41\n",
      "converged:                       True   LL-Null:                       -1871.6\n",
      "Covariance Type:            nonrobust   LLR p-value:                     0.000\n",
      "=======================================================================================\n",
      "                          coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------\n",
      "mean reviewer score     2.4350      0.097     25.118      0.000       2.245       2.625\n",
      "top ten school?         0.3536      0.161      2.194      0.028       0.038       0.670\n",
      "constant              -13.2707      0.526    -25.212      0.000     -14.302     -12.239\n",
      "=======================================================================================\n",
      "[732, 2407] 3139\n"
     ]
    }
   ],
   "source": [
    "# Logit fit of the binary decision y on the columns of X\n",
    "# (the intercept is the explicit 'constant' column).\n",
    "logreg = sm.Logit(endog=y, exog=X)\n",
    "result = logreg.fit()\n",
    "print(result.summary())\n",
    "# rows with a top-10 rank, rows without, followed by the total number of rows\n",
    "print(counts, sum(counts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of logistic regression classifier on test set: 0.92\n"
     ]
    }
   ],
   "source": [
    "# compute accuracy on a held-out test set (70/30 split with a fixed seed)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
    "# X already carries an explicit 'constant' column, so scikit-learn must not add a second,\n",
    "# perfectly collinear intercept of its own: fit_intercept=False.\n",
    "# penalty='none' switches regularisation off; newer scikit-learn releases spell it penalty=None.\n",
    "logreg = LogisticRegression(solver='newton-cg', fit_intercept=False, penalty='none')\n",
    "result = logreg.fit(X_train, y_train)\n",
    "\n",
    "# score the held-out predictions (same value as logreg.score(X_test, y_test))\n",
    "y_pred = result.predict(X_test)\n",
    "\n",
    "print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(metrics.accuracy_score(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one observation per (2020 paper, distinct rank) with a separate 0/1 indicator\n",
    "# list for each of the ranks below. There is no rank-9 list: a rank of 9 is counted\n",
    "# with all the other ranks.\n",
    "top_ranks = [1, 2, 3, 4, 5, 6, 7, 8, 10]\n",
    "chi_1, chi_2, chi_3 = [], [], []\n",
    "chi_4, chi_5, chi_6 = [], [], []\n",
    "chi_7, chi_8, chi_10 = [], [], []\n",
    "indicator_columns = dict(zip(top_ranks, [chi_1, chi_2, chi_3, chi_4, chi_5, chi_6, chi_7, chi_8, chi_10]))\n",
    "# counts[i] = rows whose rank is top_ranks[i]; the last slot counts every other rank\n",
    "counts = [0] * 10\n",
    "\n",
    "scores = []\n",
    "y = []\n",
    "for rankings, ratings, decision, year in zip(data['csranking'], data['ratings'], data['decisions'], data['year']):\n",
    "    # only 2020 papers that have both ranking and rating information\n",
    "    if pd.isnull(rankings) or pd.isnull(ratings) or year != 2020:\n",
    "        continue\n",
    "\n",
    "    # get mean reviewer score (ratings are ';'-separated integers)\n",
    "    rating_avg = np.average([int(x) for x in ratings.split(';')])\n",
    "\n",
    "    # get distinct rankings; -1 entries are discarded\n",
    "    ranks_set = set(int(x) for x in rankings.split(';'))\n",
    "    ranks_set.discard(-1)\n",
    "    if not ranks_set:\n",
    "        # no usable rank, so this paper contributes no rows\n",
    "        continue\n",
    "\n",
    "    # get decision; index directly so that an unmapped label raises KeyError here\n",
    "    # instead of silently putting None into y and breaking the model fit later\n",
    "    binary_decision = decision_to_binary[decision]\n",
    "\n",
    "    # for each distinct rank\n",
    "    for rank in ranks_set:\n",
    "        scores.append(rating_avg)\n",
    "        y.append(binary_decision)\n",
    "\n",
    "        # every indicator list gets exactly one entry per row: 1 for the matching rank, else 0\n",
    "        for top_rank, column in indicator_columns.items():\n",
    "            column.append(int(rank == top_rank))\n",
    "\n",
    "        slot = top_ranks.index(rank) if rank in top_ranks else len(top_ranks)\n",
    "        counts[slot] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Design matrix: mean score, one indicator column per listed rank, explicit intercept column.\n",
    "X = pd.DataFrame({\n",
    "    'mean reviewer score': scores,\n",
    "    'rank 1': chi_1,\n",
    "    'rank 2': chi_2,\n",
    "    'rank 3': chi_3,\n",
    "    'rank 4': chi_4,\n",
    "    'rank 5': chi_5,\n",
    "    'rank 6': chi_6,\n",
    "    'rank 7': chi_7,\n",
    "    'rank 8 (tied)': chi_8,\n",
    "    'rank 10': chi_10,\n",
    "    'constant': [1] * len(scores),\n",
    "})\n",
    "y = np.array(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.211132\n",
      "         Iterations 9\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   No. Observations:                 3139\n",
      "Model:                          Logit   Df Residuals:                     3128\n",
      "Method:                           MLE   Df Model:                           10\n",
      "Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6459\n",
      "Time:                        02:13:37   Log-Likelihood:                -662.74\n",
      "converged:                       True   LL-Null:                       -1871.6\n",
      "Covariance Type:            nonrobust   LLR p-value:                     0.000\n",
      "=======================================================================================\n",
      "                          coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------\n",
      "mean reviewer score     2.4600      0.099     24.949      0.000       2.267       2.653\n",
      "rank 1                  0.7510      0.363      2.071      0.038       0.040       1.462\n",
      "rank 2                  0.7498      0.357      2.100      0.036       0.050       1.450\n",
      "rank 3                  0.6698      0.535      1.252      0.211      -0.379       1.718\n",
      "rank 4                 -0.0312      0.371     -0.084      0.933      -0.759       0.696\n",
      "rank 5                  0.2299      0.337      0.681      0.496      -0.432       0.891\n",
      "rank 6                 -0.8189      0.684     -1.198      0.231      -2.159       0.521\n",
      "rank 7                  1.4267      0.607      2.350      0.019       0.237       2.617\n",
      "rank 8 (tied)          -0.0118      0.369     -0.032      0.974      -0.735       0.711\n",
      "rank 10                 0.0608      0.640      0.095      0.924      -1.193       1.314\n",
      "constant              -13.4048      0.535    -25.038      0.000     -14.454     -12.356\n",
      "=======================================================================================\n",
      "[123, 99, 46, 95, 119, 31, 37, 131, 51, 2407] 3139\n"
     ]
    }
   ],
   "source": [
    "# Logit fit of the binary decision y on the columns of X\n",
    "# (the intercept is the explicit 'constant' column).\n",
    "logreg = sm.Logit(endog=y, exog=X)\n",
    "result = logreg.fit()\n",
    "print(result.summary())\n",
    "# rows per individual top-10 rank, then rows with any other rank, followed by the total\n",
    "print(counts, sum(counts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of logistic regression classifier on test set: 0.92\n"
     ]
    }
   ],
   "source": [
    "# compute accuracy on a held-out test set (70/30 split with a fixed seed)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
    "# X already carries an explicit 'constant' column, so scikit-learn must not add a second,\n",
    "# perfectly collinear intercept of its own: fit_intercept=False.\n",
    "# penalty='none' switches regularisation off; newer scikit-learn releases spell it penalty=None.\n",
    "logreg = LogisticRegression(solver='newton-cg', fit_intercept=False, penalty='none')\n",
    "result = logreg.fit(X_train, y_train)\n",
    "\n",
    "# score the held-out predictions (same value as logreg.score(X_test, y_test))\n",
    "y_pred = result.predict(X_test)\n",
    "\n",
    "print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(metrics.accuracy_score(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one observation per (2020 paper, distinct institution) with 0/1 indicators\n",
    "# for Google, Facebook and Microsoft.\n",
    "scores = []\n",
    "google_ind = []\n",
    "facebook_ind = []\n",
    "microsoft_ind = []\n",
    "\n",
    "y = []\n",
    "for institution, ratings, decision, year in zip(data['institution'], data['ratings'], data['decisions'], data['year']):\n",
    "    # only 2020 papers that have both institution and rating information\n",
    "    if pd.isnull(institution) or pd.isnull(ratings) or year != 2020:\n",
    "        continue\n",
    "\n",
    "    # get mean reviewer score (ratings are ';'-separated integers)\n",
    "    rating_avg = np.average([int(x) for x in ratings.split(';')])\n",
    "\n",
    "    # get distinct institutions in first-seen order. Iterating a set of strings would\n",
    "    # give an order that depends on per-process hash randomisation, which makes the row\n",
    "    # order (and therefore the later seeded train/test split) differ between kernel runs.\n",
    "    distinct_institutions = list(dict.fromkeys(institution.split(';')))\n",
    "\n",
    "    # get decision; index directly so that an unmapped label raises KeyError here\n",
    "    # instead of silently putting None into y and breaking the model fit later\n",
    "    binary_decision = decision_to_binary[decision]\n",
    "\n",
    "    # for each distinct institution\n",
    "    for inst in distinct_institutions:\n",
    "        scores.append(rating_avg)\n",
    "        y.append(binary_decision)\n",
    "\n",
    "        # compute Google, Facebook, and Microsoft indicators (exact name match)\n",
    "        google_ind.append(int(inst == 'Google'))\n",
    "        facebook_ind.append(int(inst == 'Facebook'))\n",
    "        microsoft_ind.append(int(inst == 'Microsoft'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "303 110 92\n"
     ]
    }
   ],
   "source": [
    "# Design matrix: mean score, one indicator column per company, explicit intercept column.\n",
    "X = pd.DataFrame({\n",
    "    'mean reviewer score': scores,\n",
    "    'google': google_ind,\n",
    "    'facebook': facebook_ind,\n",
    "    'microsoft': microsoft_ind,\n",
    "    'constant': [1] * len(scores),\n",
    "})\n",
    "# convert the labels to an ndarray, as the other two design-matrix cells do\n",
    "y = np.array(y)\n",
    "\n",
    "# count how many rows (one per paper and distinct institution) belong to each company\n",
    "print(np.sum(google_ind), np.sum(facebook_ind), np.sum(microsoft_ind))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.216186\n",
      "         Iterations 9\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   No. Observations:                 4876\n",
      "Model:                          Logit   Df Residuals:                     4871\n",
      "Method:                           MLE   Df Model:                            4\n",
      "Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6339\n",
      "Time:                        02:13:37   Log-Likelihood:                -1054.1\n",
      "converged:                       True   LL-Null:                       -2879.6\n",
      "Covariance Type:            nonrobust   LLR p-value:                     0.000\n",
      "=======================================================================================\n",
      "                          coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------\n",
      "mean reviewer score     2.3515      0.075     31.557      0.000       2.205       2.498\n",
      "google                 -0.0177      0.209     -0.085      0.932      -0.427       0.391\n",
      "facebook                0.1745      0.362      0.482      0.630      -0.536       0.885\n",
      "microsoft              -1.1837      0.402     -2.948      0.003      -1.971      -0.397\n",
      "constant              -12.7907      0.402    -31.843      0.000     -13.578     -12.003\n",
      "=======================================================================================\n"
     ]
    }
   ],
   "source": [
    "# Logit fit of the binary decision y on the columns of X\n",
    "# (the intercept is the explicit 'constant' column).\n",
    "logreg = sm.Logit(endog=y, exog=X)\n",
    "result = logreg.fit()\n",
    "print(result.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of logistic regression classifier on test set: 0.90\n"
     ]
    }
   ],
   "source": [
    "# compute accuracy on a held-out test set (70/30 split with a fixed seed)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
    "# X already carries an explicit 'constant' column, so scikit-learn must not add a second,\n",
    "# perfectly collinear intercept of its own: fit_intercept=False.\n",
    "# penalty='none' switches regularisation off; newer scikit-learn releases spell it penalty=None.\n",
    "logreg = LogisticRegression(solver='newton-cg', fit_intercept=False, penalty='none')\n",
    "result = logreg.fit(X_train, y_train)\n",
    "\n",
    "# score the held-out predictions (same value as logreg.score(X_test, y_test))\n",
    "y_pred = result.predict(X_test)\n",
    "\n",
    "print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(metrics.accuracy_score(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
