{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load data and derive per-paper features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ratings</th>\n",
       "      <th>genders</th>\n",
       "      <th>average_score</th>\n",
       "      <th>first_author_gender</th>\n",
       "      <th>last_author_gender</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3008</th>\n",
       "      <td>6;8;6</td>\n",
       "      <td>m;m</td>\n",
       "      <td>6.7</td>\n",
       "      <td>m</td>\n",
       "      <td>m</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3009</th>\n",
       "      <td>8;8</td>\n",
       "      <td>m;m</td>\n",
       "      <td>8.0</td>\n",
       "      <td>m</td>\n",
       "      <td>m</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3010</th>\n",
       "      <td>8;3;8;8</td>\n",
       "      <td>m;m</td>\n",
       "      <td>6.8</td>\n",
       "      <td>m</td>\n",
       "      <td>m</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      ratings genders  average_score first_author_gender last_author_gender\n",
       "3008    6;8;6     m;m            6.7                   m                  m\n",
       "3009      8;8     m;m            8.0                   m                  m\n",
       "3010  8;3;8;8     m;m            6.8                   m                  m"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# read data\n",
    "df = pd.read_csv(\"../dataset_final.csv\")\n",
    "\n",
    "# Select the 2020 papers with labelled genders. Only rows whose 'genders'\n",
    "# value is exactly \"-1;-1\" are dropped here.\n",
    "# .copy() makes data_20 an independent frame, so the column assignments\n",
    "# below never write into a slice of df (no SettingWithCopyWarning).\n",
    "is_labelled_2020 = (df['year'] == 2020) & (df['genders'] != \"-1;-1\")\n",
    "data_20 = df.loc[is_labelled_2020, ['ratings', 'genders']].copy()\n",
    "\n",
    "# calculate average score\n",
    "def calculate_average_score(row):\n",
    "    \"\"\"Return the mean of the row's ratings, rounded to one decimal place.\n",
    "\n",
    "    row['ratings'] is a ';'-separated string of numbers, e.g. \"6;8;6\".\n",
    "    \"\"\"\n",
    "    ratings = [float(rating) for rating in row['ratings'].split(\";\")]\n",
    "    return round(sum(ratings) / len(ratings), 1)\n",
    "\n",
    "# parse author gender\n",
    "def get_author_gender_at(pos):\n",
    "    \"\"\"Build a row function that returns the gender label at index `pos`.\n",
    "\n",
    "    `pos` indexes the ';'-separated row['genders'] string: 0 selects the\n",
    "    first entry and -1 the last one.\n",
    "    \"\"\"\n",
    "    def parse_gender(row):\n",
    "        return row['genders'].split(\";\")[pos]\n",
    "    return parse_gender\n",
    "\n",
    "data_20['average_score'] = data_20.apply(calculate_average_score, axis = 1)\n",
    "data_20['first_author_gender'] = data_20.apply(get_author_gender_at(0), axis = 1)\n",
    "data_20['last_author_gender'] = data_20.apply(get_author_gender_at(-1), axis = 1)\n",
    "\n",
    "data_20.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of papers with first author is female 272\n",
      "Total number of papers with first author is male 2145\n"
     ]
    }
   ],
   "source": [
    "# Papers per first-author gender label; summing the boolean mask counts the matches.\n",
    "print(\"Total number of papers with first author is female\", (data_20['first_author_gender'] == \"f\").sum())\n",
    "print(\"Total number of papers with first author is male\", (data_20['first_author_gender'] == \"m\").sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of papers with last author is female 247\n",
      "Total number of papers with last author is male 2249\n"
     ]
    }
   ],
   "source": [
    "# Papers per last-author gender label; summing the boolean mask counts the matches.\n",
    "print(\"Total number of papers with last author is female\", (data_20['last_author_gender'] == \"f\").sum())\n",
    "print(\"Total number of papers with last author is male\", (data_20['last_author_gender'] == \"m\").sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Mann-Whitney U test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import scipy.stats as stats\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-paper average scores, split by the first author's gender label.\n",
    "male_first = data_20.loc[data_20['first_author_gender'] == \"m\", 'average_score']\n",
    "female_first = data_20.loc[data_20['first_author_gender'] == \"f\", 'average_score']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average score of papers with first author is female 4.056985294117647\n",
      "Average score of papers with first author is male 4.2129603729603735\n"
     ]
    }
   ],
   "source": [
    "# Mean of the per-paper average scores for each first-author group.\n",
    "print(\n",
    "    \"Average score of papers with first author is female\",\n",
    "    female_first.mean(),\n",
    ")\n",
    "print(\n",
    "    \"Average score of papers with first author is male\",\n",
    "    male_first.mean(),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MannwhitneyuResult(statistic=276796.5, pvalue=0.08349912595964876)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Name the alternative explicitly: SciPy releases before 1.7 default to the\n",
    "# deprecated alternative=None (p-value is half the two-sided one), while\n",
    "# later releases default to 'two-sided', so the bare call is version-dependent.\n",
    "stats.mannwhitneyu(female_first, male_first, alternative='two-sided')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-paper average scores, split by the last author's gender label.\n",
    "male_last = data_20.loc[data_20['last_author_gender'] == \"m\", 'average_score']\n",
    "female_last = data_20.loc[data_20['last_author_gender'] == \"f\", 'average_score']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average score of papers with last author is female 4.178137651821863\n",
      "Average score of papers with last author is male 4.198932859048466\n"
     ]
    }
   ],
   "source": [
    "# Mean of the per-paper average scores for each last-author group.\n",
    "print(\n",
    "    \"Average score of papers with last author is female\",\n",
    "    female_last.mean(),\n",
    ")\n",
    "print(\n",
    "    \"Average score of papers with last author is male\",\n",
    "    male_last.mean(),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MannwhitneyuResult(statistic=275505.5, pvalue=0.41695158116074954)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Name the alternative explicitly: SciPy releases before 1.7 default to the\n",
    "# deprecated alternative=None (p-value is half the two-sided one), while\n",
    "# later releases default to 'two-sided', so the bare call is version-dependent.\n",
    "stats.mannwhitneyu(female_last, male_last, alternative='two-sided')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
