{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('../dataset_final.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert decision to binary classes accept, reject\n",
    "decision_to_binary = {'Accept (Poster)': 1, 'Accept (Oral)': 1, 'Accept (Talk)': 1, 'Accept (Spotlight)': 1, 'Invite to Workshop Track': 0, 'Withdrawn': 0, 'Reject': 0}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "citpubff = []\n",
    "citpubfl = []\n",
    "citpubmf = []\n",
    "citpubml = []\n",
    "citsff = []\n",
    "citsfl = []\n",
    "citsmf = []\n",
    "citsml = []\n",
    "cnt = 0\n",
    "for p_citations, cits, pubs, h_indices, year, gender in zip(data['citations'], data['authors_citations'], data['authors_publications'], data['authors_h-index'],data['year'], data['genders']):\n",
    "        if year != 2020:\n",
    "            continue\n",
    "        \n",
    "        # author statistics, gender\n",
    "        citations = cits.split(';')\n",
    "        publications = pubs.split(';')\n",
    "        h_index = h_indices.split(';')\n",
    "        genders = gender.split(';')\n",
    "        \n",
    "        # count female first and last authors\n",
    "        if genders[0] == 'f' and genders[-1] == 'f':\n",
    "            cnt+=1\n",
    "        \n",
    "        # get female first author stats\n",
    "        if genders[0] == 'f':\n",
    "            if int(publications[0]) != 0 and int(publications[0]) != -1 and int(citations[0]) != -1:\n",
    "                citpubff.append(int(citations[0])/int(publications[0]))\n",
    "            if int(p_citations) != -1:\n",
    "                citsff.append(int(p_citations))\n",
    "        # get female last author stats\n",
    "        if genders[-1] == 'f':\n",
    "            if int(publications[-1]) != 0 and int(publications[-1]) != -1 and int(citations[-1]) != -1:\n",
    "                citpubfl.append(int(citations[-1])/int(publications[-1]))\n",
    "            if int(p_citations) != -1:\n",
    "                citsfl.append(int(p_citations))\n",
    "        \n",
    "        # get male first author stats\n",
    "        if genders[0] == 'm':\n",
    "            if int(publications[0]) != 0 and int(publications[0]) != -1 and int(citations[0]) != -1:\n",
    "                citpubmf.append(int(citations[0])/int(publications[0]))\n",
    "            if int(p_citations) != -1:\n",
    "                citsmf.append(int(p_citations))\n",
    "        # get male last author stats\n",
    "        if genders[-1] == 'm':\n",
    "            if int(publications[-1]) != 0 and int(publications[-1]) != -1 and int(citations[-1]) != -1:\n",
    "                citpubml.append(int(citations[-1])/int(publications[-1]))\n",
    "            if int(p_citations) != -1:\n",
    "                citsml.append(int(p_citations))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# female first and last authors\n",
      "38\n",
      "\n",
      "citations female first author: mean, median, count\n",
      "6.2164179104477615 0.0 268\n",
      "\n",
      "citations female last author: mean, median, count\n",
      "4.159663865546219 1.0 238\n",
      "\n",
      "citations male first author: mean, median, count\n",
      "4.480890179003387 1.0 2067\n",
      "\n",
      "citations male lase author: mean, median, count\n",
      "4.728150873965041 1.0 2174\n",
      "\n",
      "\n",
      "\n",
      "citations/publication female first author: mean, median, count\n",
      "11.77200697818576 5.067274305555555 264\n",
      "\n",
      "citations/publication female last author: mean, median, count\n",
      "33.42697553793458 16.891797556719023 235\n",
      "\n",
      "citations/publication male first author: mean, median, count\n",
      "15.16114169806239 5.757575757575758 2041\n",
      "\n",
      "citations/publication male last author: mean, median, count\n",
      "44.09177635450678 19.422287390029325 2147\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print('# female first and last authors')\n",
    "print(cnt)\n",
    "print()\n",
    "\n",
    "print('citations female first author: mean, median, count')\n",
    "print(np.mean(citsff), np.median(citsff), len(citsff))\n",
    "print()\n",
    "print('citations female last author: mean, median, count')\n",
    "print(np.mean(citsfl), np.median(citsfl), len(citsfl))\n",
    "print()\n",
    "print('citations male first author: mean, median, count')\n",
    "print(np.mean(citsmf), np.median(citsmf), len(citsmf))\n",
    "print()\n",
    "print('citations male lase author: mean, median, count')\n",
    "print(np.mean(citsml), np.median(citsml), len(citsml))\n",
    "print()\n",
    "print()\n",
    "print()\n",
    "print('citations/publication female first author: mean, median, count')\n",
    "print(np.mean(citpubff), np.median(citpubff), len(citpubff))\n",
    "print()\n",
    "print('citations/publication female last author: mean, median, count')\n",
    "print(np.mean(citpubfl), np.median(citpubfl), len(citpubfl))\n",
    "print()\n",
    "print('citations/publication male first author: mean, median, count')\n",
    "print(np.mean(citpubmf), np.median(citpubmf), len(citpubmf))\n",
    "print()\n",
    "print('citations/publication male last author: mean, median, count')\n",
    "print(np.mean(citpubml), np.median(citpubml), len(citpubml))\n",
    "print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "accepted = 0\n",
    "tot = 0\n",
    "mf = 0\n",
    "ff = 0\n",
    "ml = 0\n",
    "fl = 0\n",
    "mfa = 0\n",
    "ffa = 0\n",
    "mla = 0\n",
    "fla = 0\n",
    "uf = 0\n",
    "ul = 0\n",
    "l_mf = []\n",
    "l_ff = []\n",
    "l_ml = []\n",
    "l_fl = []\n",
    "l_mfa = []\n",
    "l_ffa = []\n",
    "l_mla = []\n",
    "l_fla = []\n",
    "for gender, decision, ratings, year in zip(data['genders'], data['decisions'], data['ratings'], data['year']):\n",
    "    if year != 2020:\n",
    "        continue\n",
    "    \n",
    "    # get decision\n",
    "    binary_decision = decision_to_binary.get(decision)\n",
    "    \n",
    "    # get genders\n",
    "    genders = gender.split(';')\n",
    "    \n",
    "    # get mean reviewer score\n",
    "    rates = ratings.split(';')\n",
    "    rates = [int(x) for x in rates]\n",
    "    rating_avg = np.average(rates)\n",
    "    \n",
    "    # count total data points\n",
    "    tot += 1\n",
    "    \n",
    "    # count unlabeled first authors\n",
    "    if genders[0] != 'm' and genders[0] != 'f':\n",
    "        uf += 1\n",
    "    \n",
    "    # count unlabeled last authors\n",
    "    if genders[-1] != 'm' and genders[-1] != 'f':\n",
    "        ul += 1\n",
    "    \n",
    "    # get male, female first author scores\n",
    "    if genders[0] == 'm':\n",
    "        mf += 1\n",
    "        l_mf.append(rating_avg)\n",
    "    if genders[0] == 'f':\n",
    "        ff += 1\n",
    "        l_ff.append(rating_avg)\n",
    "\n",
    "    # get male, female last author scores\n",
    "    if genders[-1] == 'm':\n",
    "        ml += 1\n",
    "        l_ml.append(rating_avg)\n",
    "    if genders[-1] == 'f':\n",
    "        fl += 1\n",
    "        l_fl.append(rating_avg)\n",
    "    \n",
    "    # if accepted\n",
    "    if binary_decision:\n",
    "        accepted += 1\n",
    "        \n",
    "        # get male, female first author scores\n",
    "        if genders[0] == 'm':\n",
    "            mfa += 1\n",
    "            l_mfa.append(rating_avg)\n",
    "        if genders[0] == 'f':\n",
    "            ffa += 1\n",
    "            l_ffa.append(rating_avg)\n",
    "            \n",
    "        # get male, female last author scores\n",
    "        if genders[-1] == 'm':\n",
    "            mla += 1\n",
    "            l_mla.append(rating_avg)\n",
    "        if genders[-1] == 'f':\n",
    "            fla += 1\n",
    "            l_fla.append(rating_avg)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "percent of papers: m first, f first, m last, f last\n",
      "0.8375 0.10625 0.878125 0.096484375\n",
      "\n",
      "percent of accepted papers: m first, f first, m last, f last\n",
      "0.8442503639010189 0.09315866084425037 0.8850072780203785 0.09461426491994178\n",
      "\n",
      "acceptance rate: m first, f first, m last, f last\n",
      "0.27052238805970147 0.23529411764705882 0.2704626334519573 0.2631578947368421\n",
      "\n",
      "average score: m first, f first, m last, f last\n",
      "4.212336753731344 4.058639705882353 4.198509786476868 4.178407557354926\n",
      "\n",
      "average score of accepted papers: m first, f first, m last, f last\n",
      "6.268735632183908 6.14453125 6.248629385964913 6.171025641025641\n",
      "\n",
      "percent unlabled first, last\n",
      "0.05625 0.025390625\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print('percent of papers: m first, f first, m last, f last')\n",
    "print(mf/tot, ff/tot, ml/tot, fl/tot)\n",
    "print()\n",
    "print('percent of accepted papers: m first, f first, m last, f last')\n",
    "print(mfa/accepted, ffa/accepted, mla/accepted, fla/accepted)\n",
    "print()\n",
    "print('acceptance rate: m first, f first, m last, f last')\n",
    "print(mfa/mf, ffa/ff, mla/ml, fla/fl)\n",
    "print()\n",
    "print('average score: m first, f first, m last, f last')\n",
    "print(np.mean(l_mf), np.mean(l_ff), np.mean(l_ml), np.mean(l_fl))\n",
    "print()\n",
    "print('average score of accepted papers: m first, f first, m last, f last')\n",
    "print(np.mean(l_mfa), np.mean(l_ffa), np.mean(l_mla), np.mean(l_fla))\n",
    "print()\n",
    "print('percent unlabled first, last')\n",
    "print(uf/tot, ul/tot)\n",
    "print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "mf10 = 0\n",
    "mfnot = 0\n",
    "ff10 = 0\n",
    "ffnot = 0\n",
    "ml10 = 0\n",
    "mlnot = 0\n",
    "fl10 = 0\n",
    "flnot = 0\n",
    "tot = 0\n",
    "\n",
    "tot10f = 0\n",
    "tot10l = 0\n",
    "for gender, rankings, year in zip(data['genders'], data['csranking'], data['year']):\n",
    "    if pd.notnull(rankings) and year == 2020:\n",
    "        # count total data points\n",
    "        tot+=1\n",
    "        \n",
    "        # get distinct rankings\n",
    "        ranks = rankings.split(';')\n",
    "        ranks = [int(x) for x in ranks]\n",
    "        \n",
    "        # get genders\n",
    "        genders = gender.split(';')\n",
    "        \n",
    "        # if first author top 10\n",
    "        if 1<= ranks[0] <= 10:\n",
    "            tot10f+=1\n",
    "            \n",
    "            if genders[0] == 'm':\n",
    "                mf10 += 1\n",
    "\n",
    "            if genders[0] == 'f':\n",
    "                ff10 += 1\n",
    "        \n",
    "        # if last author top 10\n",
    "        if 1<= ranks[-1] <= 10:\n",
    "            tot10l+=1\n",
    "            \n",
    "            if genders[-1] == 'm':\n",
    "                ml10 += 1\n",
    "\n",
    "            if genders[-1] == 'f':\n",
    "                fl10 += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "percent of male first authors, top 10\n",
      "0.8395604395604396\n",
      "\n",
      "percent of female first authors, top 10\n",
      "0.12307692307692308\n",
      "\n",
      "percent of male last authors, top 10\n",
      "0.8402061855670103\n",
      "\n",
      "percent of female last authors, top 10\n",
      "0.13402061855670103\n",
      "\n",
      "total data points\n",
      "2551\n"
     ]
    }
   ],
   "source": [
    "print('percent of male first authors, top 10')\n",
    "print(mf10/tot10f)\n",
    "print()\n",
    "print('percent of female first authors, top 10')\n",
    "print(ff10/tot10f)\n",
    "print()\n",
    "\n",
    "print('percent of male last authors, top 10')\n",
    "print(ml10/tot10l)\n",
    "print()\n",
    "print('percent of female last authors, top 10')\n",
    "print(fl10/tot10l)\n",
    "print()\n",
    "print('total data points')\n",
    "print(tot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
