{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.stats import ttest_ind\n",
    "from datasets import load_dataset\n",
    "import json\n",
    "from collections import defaultdict\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root directory that holds the analysis outputs read further down.\n",
    "base_path = '~/user'\n",
    "\n",
    "# Identifiers of the summarizers and of the filters.\n",
    "summarizers = ['textrank', 'matchsum', 'presumm_ext', 'presumm_abs', 'bart', 'pegasus', 'azure']\n",
    "filters = ['overall', 'noun', 'pron', 'noun_pron', 'adj']\n",
    "\n",
    "# JSON text is UTF-8 by specification; state the encoding explicitly so the\n",
    "# load does not depend on the platform's locale default.\n",
    "with open('../data/groups_cased.json', 'r', encoding='utf-8') as f:\n",
    "    groups = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group names, split by the category they belong to.\n",
    "gender = ['men', 'women']\n",
    "ethnicity = ['black', 'white', 'hispanic', 'asian']\n",
    "religion = ['islam', 'christianity', 'jewish']\n",
    "other = [*ethnicity, *religion]\n",
    "\n",
    "# Matching column names carry an '_art' suffix.\n",
    "gender_cols = [name + '_art' for name in gender]\n",
    "other_cols = [name + '_art' for name in other]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the CSV locations from `base_path` rather than repeating the root directory.\n",
    "textrank_dir = f'{base_path}/textrank_analysis'\n",
    "df1 = pd.read_csv(f'{textrank_dir}/comparison_similarities_pron_noun.csv', index_col=0, usecols=range(1,20))\n",
    "df2 = pd.read_csv(f'{textrank_dir}/comparison_similarities_adj_noun.csv', index_col=0, usecols=range(1,20))\n",
    "\n",
    "# Gender columns are taken from the pron/noun file; the ethnicity and religion\n",
    "# columns are taken from the adj/noun file.\n",
    "df = df1[gender_cols].copy()\n",
    "df[other_cols] = df2[other_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keyword-count table (get_ids looks up one column per group name).\n",
    "# The root directory comes from `base_path` instead of being repeated here.\n",
    "keywords = pd.read_csv(f'{base_path}/cnn_dm_analysis/keyword_counts.csv', index_col=0)\n",
    "\n",
    "def get_ids(groupname):\n",
    "    \"\"\"Return the index labels of the `keywords` rows whose `groupname` value is above zero.\n",
    "\n",
    "    Args:\n",
    "        groupname: name of a column in `keywords`.\n",
    "\n",
    "    Returns:\n",
    "        list of index labels.\n",
    "    \"\"\"\n",
    "    has_keyword = keywords[groupname] > 0\n",
    "    matching_rows = keywords.loc[has_keyword]\n",
    "    return matching_rows.index.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_association_threshold(q):\n",
    "    \"\"\"Compute, for every entry of `groups`, the q-quantile of its `<group>_art` column.\n",
    "\n",
    "    Only the rows of `df` whose labels are returned by `get_ids(group)` are used.\n",
    "\n",
    "    Args:\n",
    "        q: quantile, passed unchanged to `Series.quantile`.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each group to its threshold.\n",
    "    \"\"\"\n",
    "    return {\n",
    "        group: df[f'{group}_art'].loc[get_ids(group)].quantile(q)\n",
    "        for group in groups\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `association_thresholds` only existed as a local inside\n",
    "# get_association_threshold, so displaying it raised NameError on a fresh\n",
    "# kernel. Compute it explicitly before showing it.\n",
    "# NOTE(review): 0.5 is a placeholder quantile - TODO confirm the intended value.\n",
    "ASSOCIATION_QUANTILE = 0.5\n",
    "association_thresholds = get_association_threshold(ASSOCIATION_QUANTILE)\n",
    "association_thresholds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the thresholds stored on disk and display them.\n",
    "# JSON text is UTF-8 by specification; state the encoding explicitly so the\n",
    "# load does not depend on the platform's locale default.\n",
    "with open('../data/association_thresholds.json', 'r', encoding='utf-8') as f:\n",
    "    t = json.load(f)\n",
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6.8 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
