{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import statsmodels.api as sm\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the arXiv submission table. The path is relative to the notebook's\n",
    "# working directory. Later cells read the 'ratings', 'csranking',\n",
    "# 'institution', 'decisions', 'year', 'days' and 'versions' columns.\n",
    "data = pd.read_csv(filepath_or_buffer='../dataset_arxiv.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map every raw decision label to a binary outcome: 1 = accept, 0 = reject.\n",
    "# Workshop invitations and withdrawals are counted as rejections.\n",
    "decision_to_binary = {\n",
    "    'Accept (Poster)': 1,\n",
    "    'Accept (Oral)': 1,\n",
    "    'Accept (Talk)': 1,\n",
    "    'Accept (Spotlight)': 1,\n",
    "    'Invite to Workshop Track': 0,\n",
    "    'Withdrawn': 0,\n",
    "    'Reject': 0,\n",
    "    'accept': 1,\n",
    "    'reject': 0,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows for the ranking-based regression; the three lists stay index-aligned.\n",
    "scores = []  # mean reviewer score of the paper\n",
    "arXiv = []   # 1 if the paper's arXiv version counts as visible during review, else 0\n",
    "y = []       # binary decision looked up in decision_to_binary (None if the label is unmapped)\n",
    "\n",
    "# cnt: collected rows with a visible arXiv version\n",
    "# tot: collected rows with any arXiv version at all\n",
    "cnt = 0\n",
    "tot = 0\n",
    "\n",
    "for ratings, rankings, decision, year, days, versions in zip(\n",
    "        data['ratings'], data['csranking'], data['decisions'],\n",
    "        data['year'], data['days'], data['versions']):\n",
    "    # keep only 2020 rows that carry ranking information\n",
    "    if year != 2020 or pd.isnull(rankings):\n",
    "        continue\n",
    "\n",
    "    binary_decision = decision_to_binary.get(decision)\n",
    "\n",
    "    # distinct ranks listed on the paper (';'-separated integers); -1 entries are dropped\n",
    "    ranks_set = {int(token) for token in rankings.split(';')}\n",
    "    ranks_set.discard(-1)\n",
    "\n",
    "    # mean of the ';'-separated integer reviewer ratings\n",
    "    rating_avg = np.average([int(token) for token in ratings.split(';')])\n",
    "\n",
    "    # one row per distinct rank that falls in the studied subset\n",
    "    for rank in ranks_set:\n",
    "        # Pick the subset of ranks to study by keeping exactly one of these\n",
    "        # skip-conditions active:\n",
    "        #   CMU / MIT / Cornell only (active below)\n",
    "        #   not top 10:                         if 1 <= rank <= 10: continue\n",
    "        #   top 10:                             if rank > 10: continue\n",
    "        #   top 10 excluding CMU/MIT/Cornell:   if rank > 10 or rank in (1, 2, 7): continue\n",
    "        if rank not in (1, 2, 7):\n",
    "            continue\n",
    "\n",
    "        if pd.isnull(versions):\n",
    "            # not on arXiv\n",
    "            visible_flag = 0\n",
    "        else:\n",
    "            # on arXiv: treated as visible during the review process when days >= -7\n",
    "            # NOTE(review): the reference date that `days` is measured from is not shown here -- confirm\n",
    "            visible_flag = 1 if days >= -7 else 0\n",
    "            cnt += visible_flag\n",
    "            tot += 1\n",
    "\n",
    "        arXiv.append(visible_flag)\n",
    "        scores.append(rating_avg)\n",
    "        y.append(binary_decision)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Design matrix for the logit fit: one row per entry collected in the loop above.\n",
    "# The explicit all-ones 'constant' column serves as the intercept term.\n",
    "X = pd.DataFrame({\n",
    "    'mean reviewer score': scores,\n",
    "    'arXiv': arXiv,\n",
    "    'constant': [1] * len(scores),\n",
    "})\n",
    "\n",
    "# response vector, aligned row-for-row with X\n",
    "y = np.array(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "82 161 0.5093167701863354 259\n"
     ]
    }
   ],
   "source": [
    "# Rows with a visible arXiv version, rows with any arXiv version, their ratio,\n",
    "# and the total number of rows collected.\n",
    "visible_share = cnt / tot\n",
    "print(cnt, tot, visible_share, len(scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.203377\n",
      "         Iterations 9\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   No. Observations:                  259\n",
      "Model:                          Logit   Df Residuals:                      256\n",
      "Method:                           MLE   Df Model:                            2\n",
      "Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6934\n",
      "Time:                        02:14:03   Log-Likelihood:                -52.675\n",
      "converged:                       True   LL-Null:                       -171.79\n",
      "Covariance Type:            nonrobust   LLR p-value:                 1.865e-52\n",
      "=======================================================================================\n",
      "                          coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------\n",
      "mean reviewer score     2.7144      0.372      7.295      0.000       1.985       3.444\n",
      "arXiv                   1.8285      0.551      3.317      0.001       0.748       2.909\n",
      "constant              -14.4988      1.967     -7.372      0.000     -18.353     -10.644\n",
      "=======================================================================================\n"
     ]
    }
   ],
   "source": [
    "# Logit model of the binary decision (y) on the columns of X\n",
    "# (mean reviewer score, arXiv flag, constant), followed by the coefficient table.\n",
    "logreg = sm.Logit(endog=y, exog=X)\n",
    "result = logreg.fit()\n",
    "print(result.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of logistic regression classifier on test set: 0.94\n"
     ]
    }
   ],
   "source": [
    "# Hold-out accuracy with scikit-learn: 70/30 split with a fixed seed.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, random_state=0, test_size=0.3)\n",
    "\n",
    "# Unpenalised fit. X already carries a 'constant' column and fit_intercept=True\n",
    "# adds a second intercept term on top of it.\n",
    "# NOTE(review): the string 'none' is the spelling older scikit-learn releases\n",
    "# expect; newer releases want penalty=None -- confirm against the installed version.\n",
    "logreg = LogisticRegression(penalty='none', fit_intercept=True, solver='newton-cg')\n",
    "result = logreg.fit(X_train, y_train)\n",
    "\n",
    "y_pred = result.predict(X_test)\n",
    "\n",
    "test_accuracy = logreg.score(X_test, y_test)\n",
    "print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows for the institution-based regression; the three lists stay index-aligned.\n",
    "scores = []  # mean reviewer score of the paper\n",
    "arXiv = []   # 1 if the paper's arXiv version counts as visible during review, else 0\n",
    "\n",
    "# cnt: collected rows with a visible arXiv version\n",
    "# tot: collected rows with any arXiv version at all\n",
    "cnt = 0\n",
    "tot = 0\n",
    "\n",
    "y = []       # binary decision looked up in decision_to_binary\n",
    "for ratings, institution, decision, year, days, versions in zip(data['ratings'], data['institution'], data['decisions'], data['year'], data['days'], data['versions']):\n",
    "    # Filter on this cell's own loop variables only, so the selection does not\n",
    "    # depend on any name left behind by another cell.\n",
    "    if year == 2020 and pd.notnull(institution):\n",
    "        # get decision\n",
    "        binary_decision = decision_to_binary.get(decision)\n",
    "        \n",
    "        # get the distinct institutions listed on the paper (';'-separated)\n",
    "        institutions = institution.split(';')\n",
    "        institution_set = set(institutions)\n",
    "        \n",
    "        # get mean reviewer score from the ';'-separated integer ratings\n",
    "        rates = ratings.split(';')\n",
    "        rates = [int(x) for x in rates]\n",
    "        rating_avg = np.average(rates)\n",
    "        \n",
    "        # for each distinct institution\n",
    "        for inst in institution_set:\n",
    "            # uncomment which subset to study\n",
    "            \n",
    "#             if inst != 'Facebook':\n",
    "#                 continue\n",
    "#             if inst != 'Microsoft':\n",
    "#                 continue\n",
    "            if inst != 'Google':\n",
    "                continue\n",
    "            \n",
    "            # not on arXiv\n",
    "            if pd.isnull(versions):\n",
    "                arXiv.append(0)\n",
    "                \n",
    "            # if on arxiv, check if visible during the review process\n",
    "            else:\n",
    "                # count total on arxiv and how many were visible (days >= -7)\n",
    "                if days >= -7:\n",
    "                    arXiv.append(1)\n",
    "                    cnt += 1\n",
    "                else:\n",
    "                    arXiv.append(0)    \n",
    "                tot += 1\n",
    "                \n",
    "            scores.append(rating_avg)\n",
    "            y.append(binary_decision)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Design matrix for the logit fit: one row per entry collected in the loop above.\n",
    "# The explicit all-ones 'constant' column serves as the intercept term.\n",
    "X = pd.DataFrame({\n",
    "    'mean reviewer score': scores,\n",
    "    'arXiv': arXiv,\n",
    "    'constant': [1] * len(scores),\n",
    "})\n",
    "\n",
    "# response vector, aligned row-for-row with X\n",
    "y = np.array(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "105 217 0.4838709677419355\n"
     ]
    }
   ],
   "source": [
    "# Rows with a visible arXiv version, rows with any arXiv version, and their ratio.\n",
    "visible_share = cnt / tot\n",
    "print(cnt, tot, visible_share)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.258031\n",
      "         Iterations 8\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   No. Observations:                  303\n",
      "Model:                          Logit   Df Residuals:                      300\n",
      "Method:                           MLE   Df Model:                            2\n",
      "Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6172\n",
      "Time:                        02:14:03   Log-Likelihood:                -78.183\n",
      "converged:                       True   LL-Null:                       -204.24\n",
      "Covariance Type:            nonrobust   LLR p-value:                 1.792e-55\n",
      "=======================================================================================\n",
      "                          coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------\n",
      "mean reviewer score     2.4563      0.295      8.337      0.000       1.879       3.034\n",
      "arXiv                  -0.0566      0.418     -0.135      0.892      -0.877       0.763\n",
      "constant              -13.3595      1.626     -8.218      0.000     -16.546     -10.173\n",
      "=======================================================================================\n"
     ]
    }
   ],
   "source": [
    "# Logit model of the binary decision (y) on the columns of X\n",
    "# (mean reviewer score, arXiv flag, constant), followed by the coefficient table.\n",
    "logreg = sm.Logit(endog=y, exog=X)\n",
    "result = logreg.fit()\n",
    "print(result.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of logistic regression classifier on test set: 0.91\n"
     ]
    }
   ],
   "source": [
    "# Hold-out accuracy with scikit-learn: 70/30 split with a fixed seed.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, random_state=0, test_size=0.3)\n",
    "\n",
    "# Unpenalised fit. X already carries a 'constant' column and fit_intercept=True\n",
    "# adds a second intercept term on top of it.\n",
    "# NOTE(review): the string 'none' is the spelling older scikit-learn releases\n",
    "# expect; newer releases want penalty=None -- confirm against the installed version.\n",
    "logreg = LogisticRegression(penalty='none', fit_intercept=True, solver='newton-cg')\n",
    "result = logreg.fit(X_train, y_train)\n",
    "\n",
    "y_pred = result.predict(X_test)\n",
    "\n",
    "test_accuracy = logreg.score(X_test, y_test)\n",
    "print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
