{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>split</th>\n",
       "      <th>type_text</th>\n",
       "      <th>anti_stereotype</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The janitor reprimanded [the accountant] becau...</td>\n",
       "      <td>test</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[The janitor] reprimanded the accountant becau...</td>\n",
       "      <td>test</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[The chief] hired the assistant because [she] ...</td>\n",
       "      <td>test</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The chief hired [the assistant] because [he] h...</td>\n",
       "      <td>test</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[The carpenter] met with the teacher so that [...</td>\n",
       "      <td>test</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3163</th>\n",
       "      <td>The driver was annoyed with [the assistant] an...</td>\n",
       "      <td>test</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3164</th>\n",
       "      <td>The cleaner waited for [the chief] and talked ...</td>\n",
       "      <td>test</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3165</th>\n",
       "      <td>The chief waited for [the cleaner] and asked [...</td>\n",
       "      <td>test</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3166</th>\n",
       "      <td>The teacher befriended [the janitor] and told ...</td>\n",
       "      <td>test</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3167</th>\n",
       "      <td>The janitor befriended [the teacher] and told ...</td>\n",
       "      <td>test</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3168 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   text split  type_text  \\\n",
       "0     The janitor reprimanded [the accountant] becau...  test          1   \n",
       "1     [The janitor] reprimanded the accountant becau...  test          1   \n",
       "2     [The chief] hired the assistant because [she] ...  test          1   \n",
       "3     The chief hired [the assistant] because [he] h...  test          1   \n",
       "4     [The carpenter] met with the teacher so that [...  test          1   \n",
       "...                                                 ...   ...        ...   \n",
       "3163  The driver was annoyed with [the assistant] an...  test          2   \n",
       "3164  The cleaner waited for [the chief] and talked ...  test          2   \n",
       "3165  The chief waited for [the cleaner] and asked [...  test          2   \n",
       "3166  The teacher befriended [the janitor] and told ...  test          2   \n",
       "3167  The janitor befriended [the teacher] and told ...  test          2   \n",
       "\n",
       "      anti_stereotype  \n",
       "0                   1  \n",
       "1                   1  \n",
       "2                   1  \n",
       "3                   1  \n",
       "4                   1  \n",
       "...               ...  \n",
       "3163                0  \n",
       "3164                0  \n",
       "3165                0  \n",
       "3166                0  \n",
       "3167                0  \n",
       "\n",
       "[3168 rows x 4 columns]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# load the two .txt files\n",
    "df_anti_test_type1 = pd.read_csv('anti_stereotyped_type1.txt.test',header=None, sep='\\t')\n",
    "df_anti_test_type2 = pd.read_csv('anti_stereotyped_type2.txt.test',header=None, sep='\\t')\n",
    "df_anti_train_type1 = pd.read_csv('anti_stereotyped_type1.txt.dev',header=None, sep='\\t')\n",
    "df_anti_train_type2 = pd.read_csv('anti_stereotyped_type2.txt.dev',header=None, sep='\\t')\n",
    "df_stereotyped_train_type1 = pd.read_csv('pro_stereotyped_type1.txt.dev',header=None, sep='\\t')\n",
    "df_stereotyped_train_type2 = pd.read_csv('pro_stereotyped_type2.txt.dev',header=None, sep='\\t')\n",
    "df_stereotyped_test_type1 = pd.read_csv('pro_stereotyped_type1.txt.test',header=None, sep='\\t')\n",
    "df_stereotyped_test_type2 = pd.read_csv('pro_stereotyped_type2.txt.test',header=None, sep='\\t')\n",
    "\n",
    "# define the first column as the text column\n",
    "dfs = [df_anti_test_type1, df_anti_test_type2, df_anti_train_type1, df_anti_train_type2,\n",
    "         df_stereotyped_train_type1, df_stereotyped_train_type2, df_stereotyped_test_type1, df_stereotyped_test_type2]\n",
    "\n",
    "# save for the list if it is train/test\n",
    "split_list = ['test', 'test', 'train', 'train',\n",
    "               'train', 'train', 'test', 'test']\n",
    "type_list = [1, 2, 1, 2, 1, 2, 1, 2]\n",
    "\n",
    "stereotype_list = ['anti_stereotype', 'anti_stereotype', 'anti_stereotype', 'anti_stereotype',\n",
    "              'stereotype', 'stereotype', 'stereotype', 'stereotype']\n",
    "\n",
    "# loop over the dataframes and add the split and type columns\n",
    "i = 0\n",
    "for df in dfs:\n",
    "    # change the column names to text\n",
    "    df.columns = ['text']\n",
    "    \n",
    "    # clean up the text column\n",
    "    df['text'] = df['text'].str.replace(r'^\\d+\\s+', '', regex=True)\n",
    "    \n",
    "    # add the split column\n",
    "    df['split'] = split_list[i]\n",
    "    \n",
    "    # add the type column\n",
    "    df['type_text'] = type_list[i]\n",
    "    \n",
    "    # add the type column\n",
    "    df['anti_stereotype'] = 1 if stereotype_list[i] == 'anti_stereotype' else 0\n",
    "\n",
    "    # add\n",
    "    i += 1\n",
    "    \n",
    "\n",
    "# combine the two dataframes\n",
    "df_combined = pd.concat(dfs, ignore_index=True)\n",
    "df_combined\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Profession for tokenizer is empty for index 953. Using:housekeeper\n"
     ]
    }
   ],
   "source": [
    "# loop over each piece of text\n",
    "for i in range(len(df_combined)):\n",
    "    \n",
    "    # get the text\n",
    "    text = df_combined.iloc[i]['text']\n",
    "    \n",
    "    # get the profession, pronoun\n",
    "    profession = text.split(\"[\")[1].split(\"]\")[0]\n",
    "    pronoun = text.split(\"[\")[2].split(\"]\")[0]\n",
    "    \n",
    "    # get the clean sentence\n",
    "    clean_sentence = text.replace(f\"[{profession}]\", profession).replace(f\"[{pronoun}]\", pronoun).strip()\n",
    "    \n",
    "    # use the correct profession, to be passed into a tokenizer\n",
    "    profession_for_tokenizer = \" \".join(profession.split(\" \")[1:])\n",
    "    \n",
    "    # if it is nan, show the profession\n",
    "    if profession_for_tokenizer == '':\n",
    "        profession_for_tokenizer = profession.strip()\n",
    "        print(f\"Profession for tokenizer is empty for index {i}. Using:{profession_for_tokenizer}\")\n",
    "    \n",
    "    # create the prompt\n",
    "    prompt = clean_sentence + f\" '{pronoun.capitalize()}' refers to the\"\n",
    "    \n",
    "    # add all to the dataframe\n",
    "    df_combined.at[i, 'profession'] = profession\n",
    "    df_combined.at[i, 'pronoun'] = pronoun\n",
    "    df_combined.at[i, 'clean_sentence'] = clean_sentence\n",
    "    df_combined.at[i, 'prompt'] = prompt\n",
    "    df_combined.at[i, 'profession_for_tokenizer'] = profession_for_tokenizer\n",
    "    \n",
    "# save the dataframe to a .csv file\n",
    "df_combined.to_csv('winobias.csv', index=False)    \n",
    "\n",
    " # based on pronoun, map gender\n",
    "he_she_map = {\n",
    "    'he':'m', 'him':'m', 'his':'m',\n",
    "    'she':'f', 'her':'f', 'hers':'f'\n",
    "}\n",
    "df_combined['gender'] = df_combined['pronoun'].map(he_she_map)\n",
    "\n",
    "# turn it binary\n",
    "df_combined['gender_binary']= df_combined['gender'].map({'m': 1, 'f': 0})\n",
    "\n",
    "# unique values for profession_for_tokenizer\n",
    "unique_professions = df_combined['profession_for_tokenizer'].unique()\n",
    "\n",
    "\n",
    "# only leave type_text==1\n",
    "df_combined = df_combined[df_combined['type_text'] == 1]\n",
    "\n",
    "\n",
    "# save the dataframe to a .csv file\n",
    "#df_combined.to_csv('winobias.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Profession: accountant, Count: 42\n",
      "Profession: janitor, Count: 42\n",
      "Profession: chief, Count: 40\n",
      "Profession: assistant, Count: 38\n",
      "Profession: carpenter, Count: 40\n",
      "Profession: teacher, Count: 40\n",
      "Profession: lawyer, Count: 40\n",
      "Profession: laborer, Count: 40\n",
      "Profession: designer, Count: 40\n",
      "Profession: cook, Count: 40\n",
      "Profession: clerk, Count: 40\n",
      "Profession: analyst, Count: 38\n",
      "Profession: cashier, Count: 40\n",
      "Profession: guard, Count: 38\n",
      "Profession: writer, Count: 38\n",
      "Profession: housekeeper, Count: 38\n",
      "Profession: CEO, Count: 36\n",
      "Profession: hairdresser, Count: 40\n",
      "Profession: cleaner, Count: 40\n",
      "Profession: counselor, Count: 40\n",
      "Profession: developer, Count: 40\n",
      "Profession: manager, Count: 38\n",
      "Profession: mover, Count: 40\n",
      "Profession: editor, Count: 40\n",
      "Profession: farmer, Count: 40\n",
      "Profession: attendant, Count: 40\n",
      "Profession: baker, Count: 40\n",
      "Profession: receptionist, Count: 40\n",
      "Profession: construction worker, Count: 40\n",
      "Profession: driver, Count: 40\n",
      "Profession: auditor, Count: 38\n",
      "Profession: salesperson, Count: 40\n",
      "Profession: tailor, Count: 40\n",
      "Profession: mechanic, Count: 40\n",
      "Profession: librarian, Count: 40\n",
      "Profession: physician, Count: 40\n",
      "Profession: sheriff, Count: 40\n",
      "Profession: supervisor, Count: 40\n",
      "Profession: nurse, Count: 40\n",
      "Profession: secretary, Count: 38\n"
     ]
    }
   ],
   "source": [
    "# print the number of datapoints per profession\n",
    "for profession in unique_professions:\n",
    "    count = len(df_combined[df_combined['profession_for_tokenizer'] == profession])\n",
    "    print(f\"Profession: {profession}, Count: {count}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
