{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from collections import Counter\n",
    "from calibrated_marked_words import calibrated_marked_words as get_marked_words_occ\n",
    "from og_marked_words import marked_words as get_marked_words_og\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_inferred_gender = pd.read_csv('data/gpt-4o-mini/generated_personas_occupation_inferred_gender_gpt-4o-mini-2024-07-18_100_12-03-2024, 15:29:10.csv')\n",
    "\n",
    "df_gender = pd.read_csv('data/gpt-4o-mini/generated_personas_occupation_from_winogender_gender_gpt-4o-mini-2024-07-18_100_11-11-2024, 12:02:49.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "swe_inferred_gender = df_inferred_gender[df_inferred_gender['occupation'] == 'software engineer']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pprint(dic, computation = False):\n",
    "    full_list = []\n",
    "    if len(dic) == 2:\n",
    "        computation = dic[1]\n",
    "        dic = dic[0]\n",
    "    else:\n",
    "        computation = None\n",
    "    for word in sorted(dic,key=lambda x: x[1],reverse=True):\n",
    "        full_list.append(word[0])\n",
    "    if computation:\n",
    "        return full_list, computation\n",
    "    else:\n",
    "        return full_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = pd.read_csv('names_with_dem.csv')\n",
    "\n",
    "first_names = list()\n",
    "last_names = list()\n",
    "\n",
    "for name in names['First Name'].unique():\n",
    "    if type(name) == str:\n",
    "        first_names.append(name.lower())\n",
    "for name in names['Last Name'].unique():\n",
    "    if type(name) == str:\n",
    "        last_names.append(name.lower())\n",
    "\n",
    "lower_names = set(first_names + last_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_marked_word_og_and_calibrated(df, og_computation=False, inferred_gender=False, return_computation = True):\n",
    "\n",
    "    if inferred_gender:\n",
    "        gender_col = 'inferred_gender'\n",
    "    else:\n",
    "        gender_col = 'gender'\n",
    "\n",
    "    dv3_mw = dict()\n",
    "    dv3_mw_counts = dict()\n",
    "    dv3_mw_names = dict()\n",
    "    dv3_mw_occ = dict()\n",
    "    dv3_mw_occ_counts = dict()\n",
    "    dv3_mw_occ_names = dict()\n",
    "    dv3_mw_by_occ = dict()\n",
    "    dv3_mw_by_occ_counts = dict()\n",
    "    dv3_mw_by_occ_names = dict()\n",
    "    remove_names = False\n",
    "    for occupation in ['software engineer']:\n",
    "        dv3_mw_by_occ[occupation] = dict()\n",
    "        dv3_mw_by_occ_counts[occupation] = dict()\n",
    "        dv3_mw_by_occ_names[occupation] = dict()\n",
    "        \n",
    "        for race in df[gender_col].unique():\n",
    "            if og_computation:\n",
    "                outs = pprint(get_marked_words_og(df, [race], [gender_col],['M'], occupation)) # might need to remove occupation\n",
    "            else:\n",
    "                outs = pprint(get_marked_words_occ(df, [race], [gender_col],['M'], occupation))\n",
    "            new_outs = list()\n",
    "            curr_names = list()\n",
    "            for word in outs:\n",
    "                if remove_names and (word in lower_names or word[:-1] in lower_names):\n",
    "                    curr_names.append(word)\n",
    "                else:\n",
    "                    new_outs.append(word)\n",
    "            if race in dv3_mw:\n",
    "                dv3_mw[race].append([new_outs])\n",
    "                dv3_mw_names[race].append([[curr_names]])\n",
    "                if race in dv3_mw_by_occ[occupation]:\n",
    "                    dv3_mw_by_occ[occupation][race].append(new_outs)\n",
    "                    dv3_mw_by_occ_names[occupation][race].append(curr_names)\n",
    "                else:\n",
    "                    dv3_mw_by_occ[occupation][race] = new_outs\n",
    "                    dv3_mw_by_occ_names[occupation][race] = curr_names\n",
    "                if occupation in dv3_mw_occ[race]:\n",
    "                    dv3_mw_occ[race][occupation].append(new_outs)\n",
    "                    dv3_mw_occ_names[race][occupation].append(curr_names)\n",
    "                else:\n",
    "                    dv3_mw_occ[race][occupation] = new_outs\n",
    "                    dv3_mw_occ_names[race][occupation] = curr_names\n",
    "            else:\n",
    "                dv3_mw[race] = [[new_outs]]\n",
    "                dv3_mw_names[race] = [[curr_names]]\n",
    "                dv3_mw_by_occ[occupation][race] = new_outs\n",
    "                dv3_mw_by_occ_names[occupation][race] = curr_names\n",
    "                dv3_mw_occ[race] = {occupation: new_outs}\n",
    "                dv3_mw_occ_names[race] = {occupation: curr_names}\n",
    "        temps = []\n",
    "        temps_names = []\n",
    "        for race in df[gender_col].unique():        \n",
    "            if og_computation:\n",
    "                temp = pprint(get_marked_words_og(df, ['M'], [gender_col],[race], occupation))\n",
    "            else:\n",
    "                temp = pprint(get_marked_words_occ(df, ['M'], [gender_col],[race], occupation))\n",
    "            new_temp = list()\n",
    "            curr_names = list()\n",
    "            for word in temp:\n",
    "                if remove_names and (word in lower_names or word[:-1] in lower_names):\n",
    "                    curr_names.append(word)\n",
    "                else:\n",
    "                    new_temp.append(word)\n",
    "            temps.extend(new_temp)\n",
    "            temps_names.extend(curr_names)\n",
    "\n",
    "        seen = Counter(temps).most_common()\n",
    "        seen_names = Counter(temps_names).most_common()\n",
    "        num_seen = len(df[gender_col].unique()) - 1\n",
    "        m_words = [w for w, c in seen if c == num_seen]\n",
    "        m_words_names = [w for w, c in seen_names if c == num_seen]\n",
    "        if 'M' in dv3_mw:\n",
    "            dv3_mw['M'].append(m_words)\n",
    "            dv3_mw_names['M'].append(m_words_names)\n",
    "        else:\n",
    "            dv3_mw['M'] = m_words\n",
    "            dv3_mw_names['M'] = m_words_names\n",
    "        dv3_mw_by_occ[occupation]['M'] = m_words\n",
    "        dv3_mw_by_occ_names[occupation]['M'] = m_words_names\n",
    "\n",
    "    return dv3_mw, dv3_mw_counts, dv3_mw_names, dv3_mw_by_occ, dv3_mw_by_occ_counts, dv3_mw_by_occ_names, dv3_mw_occ, dv3_mw_occ_counts, dv3_mw_occ_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "his, -27.385\n",
      "he, -24.110\n",
      "him, -10.346\n",
      "david, -10.068\n",
      "james, -6.627\n",
      "alex, -6.531\n",
      "michael, -5.932\n",
      "daniel, -5.879\n",
      "himself, -5.144\n",
      "jonathan, -5.021\n",
      "her, 31.804\n",
      "she, 23.643\n",
      "maya, 16.410\n",
      "women, 13.315\n",
      "dr, 8.446\n",
      "sarah, 7.884\n",
      "girls, 6.113\n",
      "female, 5.877\n",
      "stem, 5.291\n",
      "in, 5.136\n",
      "and, 0.000\n",
      "in, 0.000\n",
      "a, 0.000\n",
      ", 0.000\n",
      "to, 0.000\n",
      "the, 0.000\n",
      "of, 0.000\n",
      "for, 0.000\n",
      "tech, 0.000\n",
      "her, 0.000\n",
      "and, 0.000\n",
      "in, 0.000\n",
      "a, 0.000\n",
      ", 0.000\n",
      "to, 0.000\n",
      "the, 0.000\n",
      "of, 0.000\n",
      "for, 0.000\n",
      "tech, 0.000\n",
      "her, 0.000\n",
      "his, -27.546\n",
      "he, -24.252\n",
      "him, -10.441\n",
      "david, -10.128\n",
      "james, -6.666\n",
      "thompson, -6.569\n",
      "michael, -5.967\n",
      "daniel, -5.914\n",
      "himself, -5.174\n",
      "jonathan, -5.051\n",
      "their, 23.963\n",
      "they, 21.101\n",
      "alex, 13.716\n",
      "nonbinary, 13.045\n",
      "identity, 11.020\n",
      "morgan, 9.003\n",
      "inclusivity, 8.956\n",
      "lgbtq, 8.684\n",
      "inclusive, 7.740\n",
      "are, 7.537\n",
      "her, -31.804\n",
      "she, -23.643\n",
      "maya, -16.410\n",
      "women, -13.315\n",
      "dr, -8.446\n",
      "sarah, -7.884\n",
      "girls, -6.113\n",
      "female, -5.877\n",
      "stem, -5.291\n",
      "in, -5.136\n",
      "his, 27.385\n",
      "he, 24.110\n",
      "him, 10.346\n",
      "david, 10.068\n",
      "james, 6.627\n",
      "alex, 6.531\n",
      "michael, 5.932\n",
      "daniel, 5.879\n",
      "himself, 5.144\n",
      "jonathan, 5.021\n",
      "and, 0.000\n",
      "in, 0.000\n",
      "a, 0.000\n",
      ", 0.000\n",
      "to, 0.000\n",
      "the, 0.000\n",
      "of, 0.000\n",
      "for, 0.000\n",
      "tech, 0.000\n",
      "her, 0.000\n",
      "and, 0.000\n",
      "in, 0.000\n",
      "a, 0.000\n",
      ", 0.000\n",
      "to, 0.000\n",
      "the, 0.000\n",
      "of, 0.000\n",
      "for, 0.000\n",
      "tech, 0.000\n",
      "her, 0.000\n",
      "their, -23.963\n",
      "they, -21.101\n",
      "alex, -13.716\n",
      "nonbinary, -13.045\n",
      "identity, -11.020\n",
      "morgan, -9.003\n",
      "inclusivity, -8.956\n",
      "lgbtq, -8.684\n",
      "inclusive, -7.740\n",
      "are, -7.537\n",
      "his, 27.546\n",
      "he, 24.252\n",
      "him, 10.441\n",
      "david, 10.128\n",
      "james, 6.666\n",
      "thompson, 6.569\n",
      "michael, 5.967\n",
      "daniel, 5.914\n",
      "himself, 5.174\n",
      "jonathan, 5.051\n"
     ]
    }
   ],
   "source": [
    "dv3_mw_gender_og, dv3_mw_counts_gender_og, dv3_mw_names_gender_og, dv3_mw_by_occ_gender_og, dv3_mw_by_occ_counts_gender_og, dv3_mw_by_occ_names_gender_og, dv3_mw_occ_gender_og, dv3_mw_occ_counts_gender_og, dv3_mw_occ_names_gender_og = compute_marked_word_og_and_calibrated(df_gender[df_gender['occupation'] == 'software engineer'], og_computation=True)\n",
    "\n",
    "dv3_mw_gender_cal, dv3_mw_counts_gender_cal, dv3_mw_names_gender_cal, dv3_mw_by_occ_gender_cal, dv3_mw_by_occ_counts_gender_cal, dv3_mw_by_occ_names_gender_cal, dv3_mw_occ_gender_cal, dv3_mw_occ_counts_gender_cal, dv3_mw_occ_names_gender_cal = compute_marked_word_og_and_calibrated(df_gender[df_gender['occupation'] == 'software engineer'], og_computation=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "not in calibrated\n",
      "['back', 'boy', 'children', 'one', 'being', 'i', 'been', 'we', 'state', 'were', 'value', 'be', 'feel', 'felt', 'should', 'our', 'expression']\n",
      "not in og\n",
      "['projects', 'github', 'online', 'values', 'keen', 'technologies', 'lifestyle', 'detailoriented', 'enjoys', 'analytical', 'underprivileged', 'struggles', 'collaborative', 'honed', 'burgeoning', 'boundaries', 'management', 'innovatech', 'cycling', 'carter', 'spends', 'startup', 'kubernetes', 'streamlined', 'knack', 'outdoor', 'contributes', 'repositories', 'avid', 'attracting', 'streamline', 'max', 'clean', 'aspirations', 'peers', 'frameworks', 'tackling', 'finds', 'manageable', 'problems', 'clients', 'mobile', 'fitness', 'jason', 'designer', 'reviews', 'immersed', 'best', 'adaptable', 'interned', 'stay', 'healthy', 'courses', 'pays', 'hours', 'andrews', 'tools', 'enthusiast', 'inc', 'graduating', 'takes', 'years', 'regularly', 'propelled', 'methodical', 'prominence', 'reynolds', 'marked', 'jameson', 'blockchain', 'jonathans', 'maintainable', 'blogs', 'updated', 'likes', 'databases', 'entrepreneurial', 'java', 'developer', 'push', 'jamess', 'outdoorsman', 'nate', 'reed', 'flourished', 'jim', 'podcasts', 'learner', 'player', 'simple', 'processes', 'adventures', 'team', 'superiors', 'activities', 'codecraft', 'continuous', 'diverse', 'aimed', 'workshops', 'workplace', 'communities', 'supportive', 'empowerment', 'equitable', 'passionate', 'underserved', 'innovator', 'trailblazing', 'generations', 'berkeley', 'pursue', 'conferences', 'focused', 'everyone', 'support', 'resilience', 'chens', 'biases', 'accessibility', 'navigating', 'bias', 'proving', 'volunteering', 'fostering', 'gap', 'aidriven', 'luna', 'countless', 'contributions', 'laude', 'cum', 'imposter', 'promote', 'networking', 'syndrome', 'educators', 'publications', 'focuses', 'resources', 'algorithms', 'featured', 'institute', 'empowered', 'recognized', 'confidence', 'summa', 'emilys', 'claras', 'mental', 'stereotypes', 'continue', 'collaborates', 'efforts', 'ellie', 'aisha', 'panels', 'academic', 'industry', 'massachusetts', 'extends', 'faced', 'workforce', 'rescue', 'perseverance', 'thousands', 'others', 'equity', 'traditionally', 'recognition', 'innovators', 'tran', 'underrepresentation', 'nguyen', 'priya', 'doctorate', 'immigrants', 'advancing', 'health', 'disparity', 'workplaces', 'prowess', 'tensorflow', 'earning', 'future', 'stem', 'pursuing', 'empowerment', 'inspire', 'hiring', 'nonprofit', 'passionate', 'prioritized', 'vuejs', 'storytelling', 'openminded', 'talks', 'organization', 'engage', 'teenage', 'focused', 'workplaces', 'rails', 'within', 'focuses', 'societal', 'multicultural', 'empathy', 'championed', 'discussions', 'environment', 'aimed', 'specialize', 'blending', 'painting', 'fields', 'culture', 'uxui', 'became', 'pioneering', 'artistic', 'ecofriendly', 'empowered', 'addition', 'uiux', 'taylors', 'usercentered', 'blossomed', 'influenced', 'express', 'stereotypes', 'outspoken', 'activism', 'disabilities', 'educate', 'themes', 'beacon', 'faced', 'creativity', 'panels', 'frontend', 'broader', 'morgans', 'proving', 'coastal', 'maledominated', 'faces', 'uplift', 'generations', 'vibrant', 'urban', 'resonate', 'wellbeing', 'user', 'break', 'quinn', 'promotes', 'oneself', 'tapestry', 'authentically', 'transcends', 'embraces', 'galleries', 'shaped', 'discuss', 'worlds', 'align', 'casey', 'particularly', 'pave', 'speculative', 'usable', 'background', 'establish', 'installations', 'practicing', 'css', 'listener', 'related']\n",
      "Gender & Marked Words\\\\\n",
      "\\hline\n",
      "M & back, boy, children\n",
      "F & one, being, i, been\n",
      "N & we, state, were, value, be, feel, felt, should, our, expression\n",
      "Gender & Calibrated Marked Words \\\\\n",
      "\\hline\n",
      "M & projects, github, online, values, keen, technologies, lifestyle, detailoriented, enjoys, analytical, underprivileged, struggles, collaborative, honed, burgeoning, boundaries, management, innovatech, cycling, carter, spends, startup, kubernetes, streamlined, knack, outdoor, contributes, repositories, avid, attracting, streamline, max, clean, aspirations, peers, frameworks, tackling, finds, manageable, problems, clients, mobile, fitness, jason, designer, reviews, immersed, best, adaptable, interned, stay, healthy, courses, pays, hours, andrews, tools, enthusiast, inc, graduating, takes, years, regularly, propelled, methodical, prominence, reynolds, marked, jameson, blockchain, jonathans, maintainable, blogs, updated, likes, databases, entrepreneurial, java, developer, push, jamess, outdoorsman, nate, reed, flourished, jim, podcasts, learner, player, simple, processes, adventures, team, superiors, activities, codecraft, continuous\\\\\n",
      "F & diverse, aimed, workshops, workplace, communities, supportive, empowerment, equitable, passionate, underserved, innovator, trailblazing, generations, berkeley, pursue, conferences, focused, everyone, support, resilience, chens, biases, accessibility, navigating, bias, proving, volunteering, fostering, gap, aidriven, luna, countless, contributions, laude, cum, imposter, promote, networking, syndrome, educators, publications, focuses, resources, algorithms, featured, institute, empowered, recognized, confidence, summa, emilys, claras, mental, stereotypes, continue, collaborates, efforts, ellie, aisha, panels, academic, industry, massachusetts, extends, faced, workforce, rescue, perseverance, thousands, others, equity, traditionally, recognition, innovators, tran, underrepresentation, nguyen, priya, doctorate, immigrants, advancing, health, disparity, workplaces, prowess, tensorflow, earning, future\\\\\n",
      "N & stem, pursuing, empowerment, inspire, hiring, nonprofit, passionate, prioritized, vuejs, storytelling, openminded, talks, organization, engage, teenage, focused, workplaces, rails, within, focuses, societal, multicultural, empathy, championed, discussions, environment, aimed, specialize, blending, painting, fields, culture, uxui, became, pioneering, artistic, ecofriendly, empowered, addition, uiux, taylors, usercentered, blossomed, influenced, express, stereotypes, outspoken, activism, disabilities, educate, themes, beacon, faced, creativity, panels, frontend, broader, morgans, proving, coastal, maledominated, faces, uplift, generations, vibrant, urban, resonate, wellbeing, user, break, quinn, promotes, oneself, tapestry, authentically, transcends, embraces, galleries, shaped, discuss, worlds, align, casey, particularly, pave, speculative, usable, background, establish, installations, practicing, css, listener, related\\\\\n"
     ]
    }
   ],
   "source": [
    "not_in_calibrated = list()\n",
    "not_in_og = list()\n",
    "not_in_calibrated_by_gender = {'M': [], 'F': [], 'N': []}\n",
    "not_in_og_by_gender = {'M': [], 'F': [], 'N': []}\n",
    "for g in ['M', 'F', 'N']:\n",
    "    str_to_print = f'{g} & '\n",
    "    for word in dv3_mw_by_occ_gender_og['software engineer'][g]:\n",
    "        if word not in dv3_mw_by_occ_gender_cal['software engineer'][g]:\n",
    "            not_in_calibrated.append(word)\n",
    "            not_in_calibrated_by_gender[g].append(word)\n",
    "    for word in dv3_mw_by_occ_gender_cal['software engineer'][g]:\n",
    "        if word not in dv3_mw_by_occ_gender_og['software engineer'][g]:\n",
    "            not_in_og.append(word)\n",
    "            not_in_og_by_gender[g].append(word)\n",
    "\n",
    "print('not in calibrated')\n",
    "print(not_in_calibrated)\n",
    "\n",
    "print('not in og')\n",
    "print(not_in_og)\n",
    "\n",
    "# Marked Words displayed are the words identified by Cheng et al.'s Marked Words method and not by our Calibrated Marked Words method.\n",
    "print('Gender & Marked Words\\\\\\\\')\n",
    "print('\\\\hline')\n",
    "for g in ['M', 'F', 'N']:\n",
    "    str_to_print = ''\n",
    "    for word in not_in_calibrated_by_gender[g]:\n",
    "        str_to_print += f'{word}, '\n",
    "    print(f'{g} & {str_to_print[:-2]}')\n",
    "\n",
    "# Calibrated Marked Words displayed are the words identified by our Calibrated Marked Words method and not by Cheng et al.'s Marked Words method.\n",
    "print('Gender & Calibrated Marked Words \\\\\\\\')\n",
    "print('\\\\hline')\n",
    "for g in ['M', 'F', 'N']:\n",
    "    str_to_print = ''\n",
    "    for word in not_in_og_by_gender[g]:\n",
    "        str_to_print += f'{word}, '\n",
    "    print(f'{g} & {str_to_print[:-2]}\\\\\\\\')\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
