{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.stats import norm\n",
    "import numpy as np\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Groups under comparison, keyed by category name.\n",
    "group_categories = dict(\n",
    "    gender=['men', 'women'],\n",
    "    race=['black', 'white', 'hispanic', 'asian'],\n",
    "    religion=['judaism', 'islam', 'christianity'],\n",
    ")\n",
    "# Summarizer names; each one is a directory under ../data/synthetic_data/analysis/.\n",
    "summarizers = [\n",
    "    'textrank', 'presumm', 'matchsum', 'azure',\n",
    "    'bart', 'pegasus', 'gpt3',\n",
    "]\n",
    "# Values substituted for {percent} in the summary directory names.\n",
    "percents = [0.1, 0.5, 0.9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ad-hoc load of the 'men' article scores, keeping only the two gender columns.\n",
    "df = pd.read_csv(\n",
    "    '../data/synthetic_data/analysis/articles/men/association_scores_pron_noun.csv',\n",
    "    index_col=0,\n",
    ")[['men', 'women']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_group_diffs(path, group_list, confidence=0.95):\n",
    "    \"\"\"Summarise per-row scores relative to each row's maximum.\n",
    "\n",
    "    Reads the CSV at `path` (first column as index), keeps the `group_list`\n",
    "    columns and rescales every row as `1 - (row_max - x) / row_max`, so the\n",
    "    largest entry of a row maps to 1.0.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    path : str\n",
    "        CSV file containing one column per entry of `group_list`.\n",
    "    group_list : list of str\n",
    "        Columns to keep; both returned lists follow this order.\n",
    "    confidence : float, optional\n",
    "        Central probability mass of the normal interval. Defaults to 0.95\n",
    "        because callers store the second return value in a 'ci95' column.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    means : list of float\n",
    "        Column-wise mean of the rescaled scores.\n",
    "    half_widths : list of float\n",
    "        Distance from each mean to the lower bound of the normal interval\n",
    "        built with `loc=mean` and `scale=std`.\n",
    "    \"\"\"\n",
    "    df = pd.read_csv(path, index_col=0)[group_list]\n",
    "    # axis=1: the lambda receives one row at a time.\n",
    "    diffs = df.apply(lambda x: 1 - (x.max() - x) / x.max(), axis=1)\n",
    "    means = diffs.mean()\n",
    "    stds = diffs.std()\n",
    "    # NOTE(review): scale is the sample std, not the standard error of the mean - confirm this is intended.\n",
    "    lower, _ = norm.interval(confidence, loc=means, scale=stds)\n",
    "    return means.to_list(), (means - lower).to_list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baselines: write one single-row .dat file per (article group, compared group) pair.\n",
    "for cat, groups in group_categories.items():\n",
    "    # The 'pron' score file is used for gender, the 'adj' file for every other category.\n",
    "    pos_filter = 'pron' if cat == 'gender' else 'adj'\n",
    "    for group in groups:\n",
    "        path = f'../data/synthetic_data/analysis/articles/{group}/association_scores_{pos_filter}_noun.csv'\n",
    "        means, cis = get_group_diffs(path, groups)\n",
    "\n",
    "        # means and cis are ordered like `groups`, so zip pairs each value with its group.\n",
    "        for target, mean, ci in zip(groups, means, cis):\n",
    "            baseline = pd.DataFrame([[target, mean, ci]], columns=['group', 'similarity', 'ci95'])\n",
    "            baseline.to_csv(f'../data/synthetic_data/plot_data/single_group/{group}_baselines_{target}.dat', sep=' ', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summaries: for each (source group, percent) write one .dat file per compared group,\n",
    "# holding one row per summarizer.\n",
    "percent_independent = ('matchsum', 'presumm')\n",
    "for cat, groups in group_categories.items():\n",
    "    pos_filter = 'pron' if cat == 'gender' else 'adj'\n",
    "    for group in groups:\n",
    "        for percent in percents:\n",
    "            rows_by_group = {g: [] for g in groups}\n",
    "            for summarizer in summarizers:\n",
    "                if summarizer in percent_independent:\n",
    "                    # Their directories carry no percent prefix, so they are only read for the first percent.\n",
    "                    if percent != percents[0]:\n",
    "                        continue\n",
    "                    path = f'../data/synthetic_data/analysis/{summarizer}/{group}/association_scores_{pos_filter}_noun.csv'\n",
    "                else:\n",
    "                    path = f'../data/synthetic_data/analysis/{summarizer}/{percent}_{group}/association_scores_{pos_filter}_noun.csv'\n",
    "\n",
    "                diffs, cis = get_group_diffs(path, groups)\n",
    "                for target, diff, ci in zip(groups, diffs, cis):\n",
    "                    rows_by_group[target].append([summarizer, target, diff, ci])\n",
    "\n",
    "            for g, rows in rows_by_group.items():\n",
    "                df = pd.DataFrame(rows, columns=['summarizer', 'group', 'similarity', 'ci95'])\n",
    "                df.to_csv(f'../data/synthetic_data/plot_data/single_group/{percent}_{group}_summaries_{g}.dat', sep=' ', index=None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# r values for the mixed-group directory names '{r:.2f}_{g1}_{1-r:.2f}_{g2}'.\n",
    "ratios = [0.1, 0.5, 0.9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mixed-group baselines: write one single-row .dat file per (ratio, g1, g2, compared group).\n",
    "for cat, groups in group_categories.items():\n",
    "    # The 'pron' score file is used for gender, the 'adj' file for every other category.\n",
    "    pos_filter = 'pron' if cat == 'gender' else 'adj'\n",
    "    for r in ratios:\n",
    "        for g1 in groups:\n",
    "            for g2 in groups:\n",
    "                # Only ordered pairs of two different groups are processed.\n",
    "                if g1 == g2:\n",
    "                    continue\n",
    "                path = f'../data/synthetic_data/analysis/articles/{r:.2f}_{g1}_{1-r:.2f}_{g2}/association_scores_{pos_filter}_noun.csv'\n",
    "                means, cis = get_group_diffs(path, groups)\n",
    "\n",
    "                # means and cis are ordered like `groups`, so zip pairs each value with its group.\n",
    "                for target, mean, ci in zip(groups, means, cis):\n",
    "                    baseline = pd.DataFrame([[target, mean, ci]], columns=['group', 'similarity', 'ci95'])\n",
    "                    baseline.to_csv(f'../data/synthetic_data/plot_data/multigroup/{r:.2f}_{g1}_{1-r:.2f}_{g2}_baselines_{target}.dat', sep=' ', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: '../data/synthetic_data/analysis/azure/0.1_0.10_black_0.90_white/association_scores_adj_noun.csv'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb Cell 9\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     <a href='vscode-notebook-cell://ssh-remote%2Bmartin/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D?line=14'>15</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m     <a href='vscode-notebook-cell://ssh-remote%2Bmartin/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D?line=15'>16</a>\u001b[0m     path \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39m../data/synthetic_data/analysis/\u001b[39m\u001b[39m{\u001b[39;00msummarizer\u001b[39m}\u001b[39;00m\u001b[39m/\u001b[39m\u001b[39m{\u001b[39;00mpercent\u001b[39m}\u001b[39;00m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mr\u001b[39m:\u001b[39;00m\u001b[39m.2f\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mg1\u001b[39m}\u001b[39;00m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39m1\u001b[39m\u001b[39m-\u001b[39mr\u001b[39m:\u001b[39;00m\u001b[39m.2f\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mg2\u001b[39m}\u001b[39;00m\u001b[39m/association_scores_\u001b[39m\u001b[39m{\u001b[39;00mpos_filter\u001b[39m}\u001b[39;00m\u001b[39m_noun.csv\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m---> <a href='vscode-notebook-cell://ssh-remote%2Bmartin/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D?line=17'>18</a>\u001b[0m diffs, cis \u001b[39m=\u001b[39m get_group_diffs(path, group_categories[cat])\n\u001b[1;32m     <a href='vscode-notebook-cell://ssh-remote%2Bmartin/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D?line=19'>20</a>\u001b[0m \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m 
\u001b[39mrange\u001b[39m(\u001b[39mlen\u001b[39m(group_categories[cat])):\n\u001b[1;32m     <a href='vscode-notebook-cell://ssh-remote%2Bmartin/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D?line=20'>21</a>\u001b[0m     data[group_categories[cat][i]]\u001b[39m.\u001b[39mappend([summarizer, group_categories[cat][i], diffs[i], cis[i]])\n",
      "\u001b[1;32m/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb Cell 9\u001b[0m in \u001b[0;36mget_group_diffs\u001b[0;34m(path, group_list)\u001b[0m\n\u001b[1;32m      <a href='vscode-notebook-cell://ssh-remote%2Bmartin/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_group_diffs\u001b[39m(path,group_list):\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2Bmartin/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m     df \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(path, index_col\u001b[39m=\u001b[39;49m\u001b[39m0\u001b[39;49m)[group_list]\n\u001b[1;32m      <a href='vscode-notebook-cell://ssh-remote%2Bmartin/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D?line=2'>3</a>\u001b[0m     diffs \u001b[39m=\u001b[39m df\u001b[39m.\u001b[39mapply(\u001b[39mlambda\u001b[39;00m x: \u001b[39m1\u001b[39m\u001b[39m-\u001b[39m(x\u001b[39m.\u001b[39mmax() \u001b[39m-\u001b[39m x) \u001b[39m/\u001b[39m x\u001b[39m.\u001b[39mmax(), axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m)\n\u001b[1;32m      <a href='vscode-notebook-cell://ssh-remote%2Bmartin/home/user/project_2021_nlp-summarization-bias/code/single_group_plots.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D?line=3'>4</a>\u001b[0m     diffs \u001b[39m=\u001b[39m df\u001b[39m.\u001b[39mapply(\u001b[39mlambda\u001b[39;00m x: \u001b[39m1\u001b[39m\u001b[39m-\u001b[39m(x\u001b[39m.\u001b[39mmax() \u001b[39m-\u001b[39m x)\u001b[39m/\u001b[39mx\u001b[39m.\u001b[39mmax(), axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m)\n",
      "File \u001b[0;32m~/miniconda3/envs/summarization/lib/python3.9/site-packages/pandas/util/_decorators.py:311\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    305\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(args) \u001b[39m>\u001b[39m num_allow_args:\n\u001b[1;32m    306\u001b[0m     warnings\u001b[39m.\u001b[39mwarn(\n\u001b[1;32m    307\u001b[0m         msg\u001b[39m.\u001b[39mformat(arguments\u001b[39m=\u001b[39marguments),\n\u001b[1;32m    308\u001b[0m         \u001b[39mFutureWarning\u001b[39;00m,\n\u001b[1;32m    309\u001b[0m         stacklevel\u001b[39m=\u001b[39mstacklevel,\n\u001b[1;32m    310\u001b[0m     )\n\u001b[0;32m--> 311\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
      "File \u001b[0;32m~/miniconda3/envs/summarization/lib/python3.9/site-packages/pandas/io/parsers/readers.py:680\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m    665\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m    666\u001b[0m     dialect,\n\u001b[1;32m    667\u001b[0m     delimiter,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    676\u001b[0m     defaults\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mdelimiter\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[1;32m    677\u001b[0m )\n\u001b[1;32m    678\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 680\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
      "File \u001b[0;32m~/miniconda3/envs/summarization/lib/python3.9/site-packages/pandas/io/parsers/readers.py:575\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    572\u001b[0m _validate_names(kwds\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mnames\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[1;32m    574\u001b[0m \u001b[39m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 575\u001b[0m parser \u001b[39m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwds)\n\u001b[1;32m    577\u001b[0m \u001b[39mif\u001b[39;00m chunksize \u001b[39mor\u001b[39;00m iterator:\n\u001b[1;32m    578\u001b[0m     \u001b[39mreturn\u001b[39;00m parser\n",
      "File \u001b[0;32m~/miniconda3/envs/summarization/lib/python3.9/site-packages/pandas/io/parsers/readers.py:933\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m    930\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptions[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m kwds[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m    932\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles: IOHandles \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m--> 933\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_engine \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_engine(f, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mengine)\n",
      "File \u001b[0;32m~/miniconda3/envs/summarization/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1217\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m   1213\u001b[0m     mode \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m   1214\u001b[0m \u001b[39m# error: No overload variant of \"get_handle\" matches argument types\u001b[39;00m\n\u001b[1;32m   1215\u001b[0m \u001b[39m# \"Union[str, PathLike[str], ReadCsvBuffer[bytes], ReadCsvBuffer[str]]\"\u001b[39;00m\n\u001b[1;32m   1216\u001b[0m \u001b[39m# , \"str\", \"bool\", \"Any\", \"Any\", \"Any\", \"Any\", \"Any\"\u001b[39;00m\n\u001b[0;32m-> 1217\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39m=\u001b[39m get_handle(  \u001b[39m# type: ignore[call-overload]\u001b[39;49;00m\n\u001b[1;32m   1218\u001b[0m     f,\n\u001b[1;32m   1219\u001b[0m     mode,\n\u001b[1;32m   1220\u001b[0m     encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[1;32m   1221\u001b[0m     compression\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mcompression\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[1;32m   1222\u001b[0m     memory_map\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mmemory_map\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mFalse\u001b[39;49;00m),\n\u001b[1;32m   1223\u001b[0m     is_text\u001b[39m=\u001b[39;49mis_text,\n\u001b[1;32m   1224\u001b[0m     
errors\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding_errors\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mstrict\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[1;32m   1225\u001b[0m     storage_options\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mstorage_options\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[1;32m   1226\u001b[0m )\n\u001b[1;32m   1227\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m   1228\u001b[0m f \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles\u001b[39m.\u001b[39mhandle\n",
      "File \u001b[0;32m~/miniconda3/envs/summarization/lib/python3.9/site-packages/pandas/io/common.py:789\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    784\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(handle, \u001b[39mstr\u001b[39m):\n\u001b[1;32m    785\u001b[0m     \u001b[39m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m    786\u001b[0m     \u001b[39m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m    787\u001b[0m     \u001b[39mif\u001b[39;00m ioargs\u001b[39m.\u001b[39mencoding \u001b[39mand\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m ioargs\u001b[39m.\u001b[39mmode:\n\u001b[1;32m    788\u001b[0m         \u001b[39m# Encoding\u001b[39;00m\n\u001b[0;32m--> 789\u001b[0m         handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39;49m(\n\u001b[1;32m    790\u001b[0m             handle,\n\u001b[1;32m    791\u001b[0m             ioargs\u001b[39m.\u001b[39;49mmode,\n\u001b[1;32m    792\u001b[0m             encoding\u001b[39m=\u001b[39;49mioargs\u001b[39m.\u001b[39;49mencoding,\n\u001b[1;32m    793\u001b[0m             errors\u001b[39m=\u001b[39;49merrors,\n\u001b[1;32m    794\u001b[0m             newline\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m    795\u001b[0m         )\n\u001b[1;32m    796\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[1;32m    797\u001b[0m         \u001b[39m# Binary mode\u001b[39;00m\n\u001b[1;32m    798\u001b[0m         handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(handle, ioargs\u001b[39m.\u001b[39mmode)\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../data/synthetic_data/analysis/azure/0.1_0.10_black_0.90_white/association_scores_adj_noun.csv'"
     ]
    }
   ],
   "source": [
    "# Mixed-group summaries: for each (ratio, g1, g2, percent) write one .dat file per compared\n",
    "# group, holding one row per summarizer that is not skipped below.\n",
    "percent_independent = ('matchsum', 'presumm')\n",
    "for cat, groups in group_categories.items():\n",
    "    pos_filter = 'pron' if cat == 'gender' else 'adj'\n",
    "    for r in ratios:\n",
    "        for g1 in groups:\n",
    "            for g2 in groups:\n",
    "                if g1 == g2:\n",
    "                    continue\n",
    "                for percent in percents:\n",
    "                    rows_by_group = {g: [] for g in groups}\n",
    "                    for summarizer in summarizers:\n",
    "                        if summarizer in percent_independent:\n",
    "                            # Their directories carry no percent prefix, so they are only read for the first percent.\n",
    "                            if percent != percents[0]:\n",
    "                                continue\n",
    "                            path = f'../data/synthetic_data/analysis/{summarizer}/{r:.2f}_{g1}_{1-r:.2f}_{g2}/association_scores_{pos_filter}_noun.csv'\n",
    "                        else:\n",
    "                            # Outside 'gender': azure is always skipped, and gpt3 is kept only for\n",
    "                            # 'religion' with percent <= 0.5 (presumably the other result files do not exist - TODO confirm).\n",
    "                            if cat != 'gender':\n",
    "                                if summarizer == 'azure':\n",
    "                                    continue\n",
    "                                if summarizer == 'gpt3' and (cat != 'religion' or percent > 0.5):\n",
    "                                    continue\n",
    "                            path = f'../data/synthetic_data/analysis/{summarizer}/{percent}_{r:.2f}_{g1}_{1-r:.2f}_{g2}/association_scores_{pos_filter}_noun.csv'\n",
    "\n",
    "                        diffs, cis = get_group_diffs(path, groups)\n",
    "                        for target, diff, ci in zip(groups, diffs, cis):\n",
    "                            rows_by_group[target].append([summarizer, target, diff, ci])\n",
    "\n",
    "                    for g, rows in rows_by_group.items():\n",
    "                        df = pd.DataFrame(rows, columns=['summarizer', 'group', 'similarity', 'ci95'])\n",
    "                        df.to_csv(f'../data/synthetic_data/plot_data/multigroup/{percent}_{r:.2f}_{g1}_{1-r:.2f}_{g2}_summaries_{g}.dat', sep=' ', index=None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.7 ('summarization')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fb9173d43df76b287d1b53052eec4ff84d0ee52790be7057998c9269beecf529"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
