{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# [Double-Hard Debias: Tailoring Word Embeddings for Gender Bias Mitigation](https://arxiv.org/abs/2005.00965)\n",
    "\n",
    "For more detailed explanations, please refer to the paper."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load original embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "import codecs, os, json, operator, pickle\n",
    "from random import shuffle\n",
    "import numpy as np\n",
    "from numpy import linalg as LA\n",
    "import scipy\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "322636 (322636, 300) 322636\n"
     ]
    }
   ],
   "source": [
    "def load_glove(path):\n",
    "    with open(path) as f:\n",
    "        lines = f.readlines()\n",
    "    \n",
    "    wv = []\n",
    "    vocab = []\n",
    "    for line in lines:\n",
    "        tokens = line.strip().split(\" \")\n",
    "        assert len(tokens) == 301\n",
    "        vocab.append(tokens[0])\n",
    "        wv.append([float(elem) for elem in tokens[1:]])\n",
    "    w2i = {w: i for i, w in enumerate(vocab)}\n",
    "    wv = np.array(wv).astype(float)\n",
    "    print(len(vocab), wv.shape, len(w2i))\n",
    "    \n",
    "    return wv, w2i, vocab\n",
    "\n",
    "# Parse the GloVe vectors once; later cells reuse wv, w2i and vocab.\n",
    "wv, w2i, vocab = load_glove('./data/vectors.txt')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Restrict Vocabulary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50000/50000 [00:00<00:00, 369472.54it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size of vocabulary: 47628\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from tqdm import tqdm\n",
    "from utils import limit_vocab\n",
    "\n",
    "\n",
    "# Gender-specific words: the male list, then the female list, then the JSON list.\n",
    "# Files are opened with an explicit encoding so the result does not depend\n",
    "# on the platform's default locale.\n",
    "gender_specific = []\n",
    "for word_file in ('./data/male_word_file.txt', './data/female_word_file.txt'):\n",
    "    with open(word_file, encoding='utf-8') as f:\n",
    "        gender_specific.extend(line.strip() for line in f)\n",
    "\n",
    "with open('./data/gender_specific_full.json', encoding='utf-8') as f:\n",
    "    gender_specific.extend(json.load(f))\n",
    "\n",
    "definitional_pairs = [['she', 'he'], ['herself', 'himself'], ['her', 'his'], ['daughter', 'son'],\n",
    "                      ['girl', 'boy'], ['mother', 'father'], ['woman', 'man'], ['mary', 'john'],\n",
    "                      ['gal', 'guy'], ['female', 'male']]\n",
    "# Flattened view of the pairs, in pair order.\n",
    "definitional_words = [word for pair in definitional_pairs for word in pair]\n",
    "\n",
    "# The gender-specific words are handed to limit_vocab as its exclude list.\n",
    "exclude_words = gender_specific\n",
    "vocab_limit, wv_limit, w2i_limit = limit_vocab(wv, w2i, vocab, exclude = exclude_words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute original bias"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference vectors for 'he' and 'she', read from the full embedding matrix wv\n",
    "# (not the restricted wv_limit); compute_bias_by_projection uses them as globals.\n",
    "he_embed = wv[w2i['he'], :]\n",
    "she_embed = wv[w2i['she'], :]\n",
    "\n",
    "def simi(a, b):\n",
    "    return 1-scipy.spatial.distance.cosine(a, b)\n",
    "\n",
    "def compute_bias_by_projection(wv, w2i, vocab):\n",
    "    \"\"\"Score each word by its similarity to 'he' minus its similarity to 'she'.\n",
    "\n",
    "    Reads the notebook-level reference vectors ``he_embed`` and ``she_embed``.\n",
    "\n",
    "    Args:\n",
    "        wv: Embedding matrix addressed through ``w2i``.\n",
    "        w2i: Mapping from word to its row in ``wv``.\n",
    "        vocab: Iterable of words to score.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping every word to ``simi(vec, he_embed) - simi(vec, she_embed)``.\n",
    "    \"\"\"\n",
    "    def _he_she_gap(vec):\n",
    "        return simi(vec, he_embed) - simi(vec, she_embed)\n",
    "\n",
    "    return {word: _he_she_gap(wv[w2i[word], :]) for word in vocab}\n",
    "\n",
    "# Bias score of every word in the restricted vocabulary, computed on the original embeddings.\n",
    "gender_bias_bef = compute_bias_by_projection(wv_limit, w2i_limit, vocab_limit)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Debias"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# get main PCA components\n",
    "def my_pca(wv):\n",
    "    \"\"\"Fit a full PCA on the mean-centred rows of ``wv``.\n",
    "\n",
    "    Args:\n",
    "        wv: 2-D array-like with one embedding per row.\n",
    "\n",
    "    Returns:\n",
    "        The fitted ``PCA`` object.\n",
    "    \"\"\"\n",
    "    wv = np.asarray(wv, dtype=float)\n",
    "    # Centre all rows with one broadcast subtraction; this yields the same\n",
    "    # values as subtracting the mean row by row in a Python loop.\n",
    "    wv_hat = wv - np.mean(wv, axis=0)\n",
    "\n",
    "    main_pca = PCA()\n",
    "    main_pca.fit(wv_hat)\n",
    "\n",
    "    return main_pca\n",
    "\n",
    "main_pca = my_pca(wv)\n",
    "# Column-wise mean of the full embedding matrix; hard_debias reads it as a global.\n",
    "# wv is already an ndarray (load_glove builds it with np.array), so it is\n",
    "# averaged directly instead of being copied through np.array() first.\n",
    "wv_mean = np.mean(wv, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def hard_debias(wv, w2i, w2i_partial, vocab_partial, component_ids):\n",
    "    \"\"\"Project out selected PCA components, then apply ``drop`` along the gender direction.\n",
    "\n",
    "    Relies on notebook globals: ``main_pca`` and ``wv_mean`` (computed from the\n",
    "    full embedding matrix), ``definitional_pairs``, and ``doPCA`` / ``drop``\n",
    "    imported from ``utils``.\n",
    "\n",
    "    Args:\n",
    "        wv: Full embedding matrix.\n",
    "        w2i: Word -> row index in ``wv``.\n",
    "        w2i_partial: Word -> row index in the returned matrix.\n",
    "        vocab_partial: Words to process.\n",
    "        component_ids: Indices into ``main_pca.components_`` to project out.\n",
    "\n",
    "    Returns:\n",
    "        Array of shape ``(len(vocab_partial), wv.shape[1])`` with the processed\n",
    "        vectors, addressed through ``w2i_partial``.\n",
    "    \"\"\"\n",
    "    directions = [main_pca.components_[idx] for idx in component_ids]\n",
    "    n_words, dim = len(vocab_partial), wv.shape[1]\n",
    "\n",
    "    # get rid of frequency features\n",
    "    wv_f = np.zeros((n_words, dim))\n",
    "    for word in vocab_partial:\n",
    "        vec = wv[w2i[word], :]\n",
    "        projection = np.zeros(vec.shape)\n",
    "        for direction in directions:\n",
    "            # Inner dot is a scalar coefficient; outer dot scales the direction by it.\n",
    "            projection += np.dot(np.dot(direction, vec), direction)\n",
    "        # The projection is taken on the raw vector; the global mean is subtracted afterwards.\n",
    "        wv_f[w2i_partial[word], :] = vec - projection - wv_mean\n",
    "\n",
    "    # debias\n",
    "    # First principal direction returned by doPCA for the definitional pairs.\n",
    "    gender_direction = doPCA(definitional_pairs, wv_f, w2i_partial).components_[0]\n",
    "\n",
    "    wv_debiased = np.zeros((n_words, dim))\n",
    "    for word in vocab_partial:\n",
    "        row = w2i_partial[word]\n",
    "        wv_debiased[row, :] = drop(wv_f[row, :], gender_direction)\n",
    "\n",
    "    return wv_debiased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "def cluster_and_visualize(words, X, random_state, y_true, num=2):\n",
    "    \"\"\"Cluster ``X`` with k-means and report how well the clusters match ``y_true``.\n",
    "\n",
    "    Args:\n",
    "        words: Not used by this function.\n",
    "        X: Feature matrix, one row per sample.\n",
    "        random_state: Seed forwarded to ``KMeans``.\n",
    "        y_true: Reference labels aligned with the rows of ``X``.\n",
    "        num: Number of clusters.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(kmeans, y_pred, X, preci)`` where ``preci`` is the larger of\n",
    "        the label-match rate and one minus that rate.\n",
    "    \"\"\"\n",
    "    kmeans = KMeans(n_clusters=num, random_state=random_state)\n",
    "    kmeans.fit(X)\n",
    "    y_pred = kmeans.predict(X)\n",
    "\n",
    "    matches = [1 if truth == guess else 0 for truth, guess in zip(y_true, y_pred)]\n",
    "    match_rate = sum(matches) / float(len(matches))\n",
    "    # Cluster ids are arbitrary; with two clusters the complement is the\n",
    "    # score of the swapped labelling, so keep whichever is larger.\n",
    "    preci = max(match_rate, 1 - match_rate)\n",
    "    print('precision', preci)\n",
    "\n",
    "    return kmeans, y_pred, X, preci"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "322636\n",
      "['managed', 'paola', 'subic', 'annabelle', 'dolly', 'skilful', 'lexi', 'engineer', 'trombonist', 'earrings', 'scored', 'rimes', 'draughtsman', 'histidine', 'gwyneth', 'eagles', 'brooch', 'allure', 'mortgaged', 'amelia', 'valenciennes', 'brightman', 'unita', 'sox', 'mcentire', 'ship', 'adriana', 'maud', 'angelica', 'janet', 'principles', 'okinawa', 'torpedoed', 'government', 'pearl', 'lillian', 'moonstone', 'atoll', 'redshirted', 'kayla', 'sonja', 'lana', 'danielle', 'bridget', 'marissa', 'funnels', 'berthing', 'office', 'dress', 'rookie', 'nikki', 'torino', 'mineralogy', 'fielder', 'casemate', 'topographical', 'naomi', 'contralto', 'lolita', 'province', 'raked', 'schooner', 'sensuality', 'morissette', 'played', 'commissar', 'elected', 'sacked', 'dionysius', 'returner', 'seconded', 'fluminense', 'karzai', 'quarterback', 'janie', 'dagmar', 'career', 'piacenza', 'xena', 'lenore', 'sylvia', 'robben', 'premier', 'kristina', 'govern', 'game', 'pageant', 'ascoli', 'tankers', 'dianne', 'ulithi', 'of', 'wrexham', 'charge', 'josie', 'pythagorean', 'ce', 'frigate', 'vatican', 'caliphate', 'cynthia', 'nandini', 'cricketers', 'glycol', 'goalscorer', 'town', 'henriette', 'esquimalt', 'postmaster', 'carrie', 'seagull', 'betsy', 'arezzo', 'chargers', 'adjutant', 'tackle', 'celia', 'infertile', 'nba', 'mechanics', 'anchorage', 'optics', 'majuro', 'cinta', 'shogun', 'müller', 'sultry', 'flyers', 'scarlets', 'slaps', 'yards', 'mms', 'mindy', 'electrodynamics', 'paige', 'warship', 'opposed', 'lili', 'worcestershire', 'royalist', 'colette', 'marilyn', 'senators', 'rhoda', 'bishoprics', 'bishopric', 'chantal', 'cleo', 'gina', 'female', 'sabrina', 'queenie', 'telemundo', 'empowerment', 'brentford', 'cece', 'amassed', 'bat', 'crusaders', 'berra', 'alouettes', 'outlaw', 'buffaloes', 'postpartum', 'rishon', 'amuro', 'khl', 'dench', 'natalya', 'dinamo', 'kathleen', 'tatiana', 'defections', 'mermaid', 'lulu', 'ivana', 'jodie', 'agrarian', 'pastorate', 'sandra', 'johann', 'secession', 
'doña', 'nathalie', 'tina', 'ruled', 'vfl', 'mimi', 'pola', 'orioles', 'consulship', 'capes', 'naturalist', 'team', 'sussex', 'revolt', 'janissaries', 'isabelle', 'hearne', 'spengler', 'westpac', 'bretagne', 'bradford', 'izumi', 'appointment', 'daud', 'vespasian', 'wigan', 'brunette', 'becky', 'veolia', 'jessica', 'sara', 'refitted', 'dribbling', 'virtus', 'elena', 'patriots', 'karina', 'trawlers', 'guarani', 'replenishment', 'erica', 'rosalie', 'millicent', 'lega', 'nicolaus', 'failed', 'maxine', 'karen', 'roberta', 'tallulah', 'cardinal', 'pregnant', 'prefect', 'derry', 'positions', 'sacking', 'medea', 'marshal', 'legate', 'julie', 'minesweeping', 'primacy', 'leyte', 'beatriz', 'primeira', 'reds', 'fought', 'antiquary', 'multan', 'prizren', 'tarragona', 'joanna', 'stampeders', 'inventions', 'ipswich', 'jill', 'guam', 'emilie', 'myra', 'lands', 'yumi', 'ducati', 'pontificate', 'relativistic', 'lucrezia', 'aaliyah', 'pitchers', 'insolvent', 'michaela', 'demi', 'undisputed', 'wnba', 'wilma', 'vessel', 'nadh', 'infanticide', 'preaching', 'crusades', 'lavigne', 'son', 'apostolic', 'cassandra', 'shima', 'regarded', 'kelli', 'mermaids', 'injuries', 'monarchist', 'ursula', 'mechelen', 'prussian', 'janine', 'prebendary', 'xiii', 'gainsborough', 'afterwards', 'inés', 'lahore', 'kyoko', 'patti', 'valentina', 'comforting', 'defensive', 'giants', 'alejandra', 'receivers', 'drafted', 'points', 'scorer', 'gotthard', 'rcn', 'kickers', 'reich', 'trimester', 'eton', 'tearfully', 'zainab', 'petrus', 'fielded', 'hereford', 'debbie', 'burges', 'tvb', 'principality', 'kilda', 'christina', 'invented', 'reserves', 'fey', 'keri', 'bradman', 'templars', 'hilda', 'ada', 'factions', 'defender', 'emma', 'walleye', 'meera', 'leah', 'utada', 'camilla', 'avex', 'kilmore', 'waterline', 'supporters', 'sauron', 'militia', 'hannah', 'methodists', 'itf', 'offensive', 'ottawa', 'catalans', 'laila', 'setup', 'bethany', 'hume', 'vikings', 'gowns', 'divisione', 'natasha', 'valerie', 'bankrupt', 
'priyanka', 'workington', 'favoured', 'crassus', 'joan', 'dahlia', 'wingate', 'sefer', 'rule', 'kashmir', 'emirs', 'beauty', 'mainz', 'inventor', 'prato', 'tanja', 'simone', 'ibn', 'ironclad', 'yasmin', 'truk', 'treviso', 'mcguinness', 'trish', 'barbarians', 'roofline', 'coadjutor', 'cpl', 'punts', 'buoys', 'phyllis', 'sappho', 'oprah', 'nurse', 'counties', 'rams', 'packers', 'reorganize', 'streep', 'ruckus', 'zara', 'pdl', 'establishing', 'appointing', 'carriers', 'doctrinal', 'cruise', 'varese', 'verona', 'usl', 'doreen', 'necklaces', 'doris', 'boers', 'jacqueline', 'juventus', 'doncaster', 'foal', 'nouméa', 'friars', 'playoff', 'mercenaries', 'philosophical', 'barbra', 'captaincy', 'hadassah', 'outlaws', 'helen', 'tribunes', 'cambridge', 'allegheny', 'carolyn', 'preacher', 'rome', 'imelda', 'bankrupted', 'nicole', 'tugboat', 'trialled', 'mosul', 'prolific', 'stourbridge', 'oshawa', 'gemma', 'whig', 'baghdad', 'breastfeeding', 'lavinia', 'players', 'squad', 'feisty', 'scherzinger', 'cavalry', 'callie', 'sally', 'fanny', 'fiona', 'felicia', 'svetlana', 'johnstown', 'confucius', 'steamer', 'loyalists', 'christening', 'eskimos', 'carrier', 'julia', 'pioneers', 'charlene', 'tamara', 'nl', 'unborn', 'lipstick', 'collingwood', 'aground', 'dijon', 'fouls', 'titleholder', 'diane', 'razed', 'lifeboats', 'surrey', 'preach', 'bremerhaven', 'bree', 'bartoli', 'signed', 'mawr', 'mccloskey', 'jelena', 'faints', 'monika', 'provincial', 'imogen', 'keeper', 'scharnhorst', 'annette', 'hegel', 'ovarian', 'grêmio', 'menlo', 'stella', 'frida', 'patented', 'phillies', 'ruth', 'anne', 'amethyst', 'npb', 'nhl', 'goalscoring', 'batgirl', 'mujeres', 'nontrivial', 'unload', 'braves', 'rayne', 'mme', 'submerged', 'promiscuous', 'minnelli', 'southgate', 'caliph', 'towed', 'steelers', 'antisubmarine', 'marlins', 'feminists', 'anchored', 'toluca', 'middlesex', 'reactivation', 'minogue', 'against', 'gossip', 'forfeited', 'moldavian', 'joseph', 'misl', 'cher', 'nina', 'corvettes', 'genevieve', 
'leader', 'sia', 'krista', 'assistant', 'antiaircraft', 'petah', 'eredivisie', 'oxford', 'sandhya', 'virgen', 'tugs', 'ham', 'lauren', 'midfielder', 'puget', 'armies', 'sulla', 'sayyid', 'ousted', 'abigail', 'tây', 'extensible', 'nasl', 'minting', 'sophia', 'transactions', 'suffragist', 'secular', 'elle', 'mona', 'rachael', 'rosemary', 'gretchen', 'vivienne', 'franky', 'goals', 'singer', 'stern', 'victories', 'arlene', 'teams', 'olga', 'cincinnati', 'gymnast', 'redhead', 'catholicos', 'megan', 'maroons', 'bonn', 'adele', 'milly', 'amanda', 'tricia', 'pga', 'bette', 'reforms', 'edna', 'mistral', 'padova', 'assures', 'nadia', 'salome', 'cavan', 'sonia', 'jennifer', 'plano', 'ordained', 'fernanda', 'lily', 'henan', 'masts', 'dundalk', 'marianas', 'rookies', 'ekaterina', 'governorship', 'madge', 'louise', 'bc', 'campeonato', 'inna', 'katharine', 'joanne', 'logic', 'sienna', 'hildegard', 'sailed', 'nascar', 'tigers', 'secessionist', 'amina', 'breda', 'balliol', 'eva', 'pittsburgh', 'refit', 'county', 'pius', 'destroyers', 'germaine', 'contestant', 'recommissioned', 'duet', 'vessels', 'wilhelmina', 'rebecca', 'amy', 'rhinos', 'nottinghamshire', 'yost', 'heather', 'fiorentina', 'lauper', 'vardar', 'telenovela', 'mary', 'colonel', 'claire', 'bettina', 'berthed', 'ligase', 'katie', 'commanders', 'tatyana', 'mujer', 'nita', 'overhaul', 'newton', 'doctrines', 'justine', 'bombshell', 'kure', 'catching', 'john', 'belinda', 'sabres', 'moorings', 'brigitte', 'middlesbrough', 'football', 'primate', 'schoolmaster', 'judy', 'pippa', 'woman', 'surveyor', 'feminine', 'edie', 'stratus', 'sheila', 'conceded', 'geschichte', 'renata', 'escorted', 'handbag', 'marianne', 'promoted', 'polaris', 'erected', 'rec', 'rihanna', 'selina', 'cherokees', 'loaned', 'kitty', 'shafts', 'who', 'douai', 'post', 'glentoran', 'ursuline', 'striker', 'cassie', 'tenure', 'dolls', 'established', 'degeneres', 'transited', 'hitomi', 'melba', 'made', 'tabitha', 'polly', 'udinese', 'successive', 'alicia', 'suzy', 
'dollhouse', 'fhm', 'linda', 'manus', 'denise', 'mechanic', 'tommaso', 'kana', 'soult', 'alisa', 'petite', 'dryden', 'reorganized', 'anchor', 'icebreakers', 'texans', 'maggie', 'admirals', 'diocese', 'pescara', 'generals', 'momo', 'clijsters', 'sasha', 'agricola', 'administrator', 'lucinda', 'oberst', 'lohan', 'averaging', 'nadezhda', 'leicestershire', 'ifa', 'locket', 'shandong', 'sasebo', 'jolie', 'arabs', 'frau', 'cookbooks', 'bea', 'repton', 'courtesan', 'ecclesiastic', 'femme', 'clement', 'club', 'pistons', 'dorothy', 'nfl', 'sligo', 'skirmish', 'methodius', 'sophie', 'molly', 'renee', 'linebackers', 'villa', 'therese', 'eleanor', 'imperium', 'scoring', 'rangers', 'brescia', 'franchises', 'christine', 'starboard', 'pregnancy', 'cricket', 'cmg', 'legionary', 'reba', 'sissy', 'kristine', 'chamberlain', 'katya', 'came', 'lucretia', 'jasmine', 'basel', 'forwards', 'shawl', 'tania', 'rushing', 'midfield', 'siena', 'anderlecht', 'nancy', 'elway', 'kara', 'astrid', 'whl', 'sack', 'dodgers', 'underway', 'mlb', 'musume', 'abbasid', 'hijab', 'serie', 'agnes', 'lingayen', 'honshū', 'harmonics', 'wilford', 'espiritu', 'connacht', 'pregnancies', 'edith', 'relied', 'regimental', 'offenses', 'troopship', 'elaine', 'mariah', 'rochdale', 'bridal', 'parton', 'viki', 'ciara', 'emmanuelle', 'paula', 'regius', 'klitschko', 'falcons', 'karin', 'ruler', 'infanta', 'homeport', 'lorena', 'uxbridge', 'seasons', 'bullpen', 'jesuit', 'sportsman', 'brandi', 'yaroslavl', 'selene', 'prelates', 'breakwater', 'dolphins', 'pankhurst', 'martine', 'pfc', 'glamorous', 'idioms', 'flirtatious', 'sassy', 'vivien', 'claudia', 'reclassified', 'janice', 'chanel', 'dictator', 'anna', 'hurling', 'lleida', 'betrothal', 'bianca', 'susanne', 'hattie', 'penalty', 'miriam', 'fk', 'accrington', 'swimwear', 'perfume', 'sexism', 'hazlitt', 'radiance', 'blouse', 'usfl', 'nell', 'ranks', 'megatron', 'maxie', 'gloucestershire', 'pacification', 'julianne', 'history', 'footballing', 'belén', 'amélie', 'azad', 
'torpedoes', 'kittens', 'besiege', 'miscarriage', 'skirt', 'winger', 'chiefly', 'violetta', 'himself', 'macro', 'giggs', 'getafe', 'anita', 'clydesdale', 'sixtus', 'refresher', 'betty', 'baseball', 'susie', 'goal', 'shrewsbury', 'saxons', 'chiara', 'jaffna', 'lisa', 'saints', 'quiero', 'escort', 'corps', 'sought', 'mandy', 'roseanne', 'refloated', 'william', 'irvin', 'assassinated', 'manager', 'mcc', 'papacy', 'lena', 'michelle', 'headquarters', 'northampton', 'reorganizing', 'aquileia', 'bengals', 'sofía', 'judith', 'hobbes', 'gaul', 'voivode', 'scotties', 'sapphire', 'cilla', 'jana', 'punjab', 'quarterbacks', 'squads', 'daniela', 'presbyterians', 'his', 'nizam', 'petrova', 'battleships', 'llanelli', 'offense', 'indians', 'shortstop', 'usns', 'méliès', 'ironclads', 'maximus', 'luisa', 'matches', 'machiavelli', 'athanasius', 'father', 'transvaal', 'fatale', 'elisa', 'lincoln', 'kylie', 'carlow', 'mooring', 'preached', 'menstruation', 'boer', 'leaguer', 'paloma', 'palmeiras', 'pope', 'blonde', 'ashley', 'ground', 'céline', 'bernadette', 'replenished', 'vettel', 'ní', 'deactivation', 'ivanovic', 'eliza', 'sharapova', 'tanya', 'angelina', 'olympiakos', 'forecastle', 'platted', 'aida', 'elisabeth', 'stacey', 'detroit', 'distraught', 'recalled', 'jillian', 'swindon', 'league', 'huddersfield', 'bench', 'tug', 'catcher', 'baby', 'atlético', 'seduces', 'roxy', 'tt', 'flirts', 'formulated', 'opposing', 'diana', 'fedex', 'consul', 'salma', 'receptionist', 'holders', 'ioannis', 'teen', 'irl', 'daria', 'nishapur', 'najaf', 'augusta', 'candice', 'mahdi', 'salamanca', 'ellen', 'augustus', 'breasts', 'kristen', 'oftentimes', 'papal', 'erotic', 'lawman', 'angie', 'yulia', 'annie', 'canterbury', 'alumna', 'antigonus', 'grt', 'destroyer', 'gail', 'caroline', 'pamela', 'cathy', 'escorts', 'sadie', 'turrets', 'lizzie', 'qutb', 'monica', 'alyssa', 'interim', 'minor', 'mattie', 'supergirl', 'navratilova', 'raptors', 'putsch', 'fetus', 'melissa', 'abby', 'democracies', 'chrissie', 
'bonnie', 'margaret', 'freemason', 'constable', 'cicero', 'layman', 'coco', 'sinking', 'settlers', 'rochester', 'nipple', 'completions', 'inquisition', 'yolanda', 'billie', 'connie', 'fortify', 'defensively', 'rotherham', 'vogue', 'whore', 'emerita', 'consecrated', 'celine', 'wars', 'julius', 'cambrai', 'revolted', 'housekeeper', 'regiment', 'gabriella', 'ywca', 'wests', 'nitra', 'jayne', 'grasshoppers', 'divisie', 'dooley', 'typhoon', 'ayesha', 'ophelia', 'portadown', 'sudha', 'anfield', 'louisa', 'exeter', 'successor', 'sheena', 'sew', 'royals', 'magda', 'muhammed', 'byzantines', 'cesena', 'marshalls', 'stefani', 'retiring', 'trowbridge', 'reassures', 'stylist', 'canon', 'doll', 'rina', 'lateran', 'decommissioning', 'redshirt', 'ramona', 'aston', 'lia', 'vanessa', 'retainers', 'expounded', 'nadine', 'fc', 'dancer', 'gabriela', 'dismissed', 'rockies', 'alison', 'marshals', 'led', 'erika', 'carly', 'trenchard', 'zoe', 'heptathlon', 'liz', 'account', 'yazid', 'kwajalein', 'casco', 'shakedown', 'alanis', 'blige', 'chippenham', 'merchantman', 'drydock', 'kbe', 'cricketing', 'marina', 'münster', 'merthyr', 'redox', 'leyla', 'janelle', 'shogunate', 'paulina', 'dido', 'disciples', 'knighted', 'ottomans', 'promotion', 'umayyad', 'femininity', 'beyoncé', 'catchers', 'fixture', 'man', 'samantha', 'tivoli', 'kangaroos', 'concessions', 'persepolis', 'zsa', 'heinkel', 'munster', 'warden', 'summary', 'giselle', 'suman', 'gillingham', 'harriet', 'melanie', 'prebend', 'heidi', 'suzie', 'steaming', 'claudine', 'armagh', 'esther', 'predecessor', 'riders', 'acb', 'rebellion', 'teutonic', 'tonnage', 'foresters', 'streisand', 'foundered', 'helped', 'trawler', 'conquered', 'had', 'azam', 'conquest', 'patrolled', 'rita', 'hoare', 'senior', 'outfield', 'marsha', 'suspensions', 'netting', 'callas', 'commissioning', 'camila', 'femina', 'canadiens', 'independiente', 'encamped', 'sarah', 'tg', 'aba', 'successors', 'circe', 'leonora', 'wollstonecraft', 'hilde', 'warrant', 'originator', 
'order', 'homeward', 'aldershot', 'treasurer', 'katanga', 'susannah', 'prelate', 'helene', 'picket', 'cristina', 'bandits', 'bikini', 'knitting', 'ustaše', 'famer', 'hortense', 'cathedral', 'uss', 'pastor', 'ohl', 'mansur', 'propellers', 'kathryn', 'filipina', 'prue', 'kristin', 'offaly', 'gilda', 'sunita', 'brevet', 'baseman', 'radnički', 'sanction', 'sails', 'unloaded', 'steph', 'superliga', 'leona', 'gallic', 'deborah', 'leagues', 'favour', 'alexandra', 'farmer', 'olivia', 'lincolnshire', 'midwife', 'tow', 'receiver', 'rachel', 'rin', 'waltrip', 'fullback', 'waived', 'browns', 'tanker', 'germania', 'rosanna', 'leicester', 'renegade', 'statistic', 'silvia', 'luton', 'cf', 'meryl', 'gerda', 'lynette', 'isabel', 'keel', 'marisa', 'organist', 'implemented', 'punt', 'theologian', 'lanark', 'nora', 'faithfull', 'brugge', 'utrecht', 'ravens', 'kathy', 'bishop', 'pavia', 'lorde', 'sermons', 'glenda', 'battleship', 'ripon', 'sailing', 'treatises', 'cleopatra', 'albion', 'utica', 'starter', 'angela', 'followers', 'kuznetsova', 'norma', 'errors', 'margo', 'offensively', 'anya', 'francisca', 'antiquities', 'chief', 'wta', 'girl', 'thistle', 'lombards', 'qin', 'natalie', 'siblings', 'kisses', 'upkeep', 'sledge', 'necklace', 'marie', 'dresses', 'pottsville', 'magister', 'grimsby', 'kirsty', 'faye', 'ginny', 'leyton', 'cruised', 'kickoff', 'scarlett', 'rebelled', 'vc', 'loan', 'commander', 'hamasaki', 'guy', 'franciscans', 'fragrance', 'appointed', 'signing', 'alumnae', 'untenable', 'everton', 'maja', 'shamrock', 'prostitute', 'wolverhampton', 'foggia', 'frontier', 'elland', 'alina', 'temporal', 'compiled', 'yelena', 'anorexia', 'minaj', 'promiscuity', 'nla', 'casemates', 'praetor', 'ypres', 'sealift', 'follower', 'lola', 'transsexual', 'sultana', 'helga', 'tottenham', 'bohemians', 'bk', 'wallachia', 'shaolin', 'jammu', 'sheerness', 'siti', 'creation', 'karolina', 'antiquarian', 'rapunzel', 'scunthorpe', 'greta', 'walsall', 'annexing', 'randle', 'relativity', 'linemen', 
'steamed', 'asw', 'taz', 'military', 'afghans', 'counterintelligence', 'footballer', 'orcs', 'hispania', 'batting', 'goalie', 'mangeshkar', 'pontius', 'sinead', 'tia', 'was', 'consolidate', 'goalkeeper', 'lata', 'eerste', 'playfair', 'mv', 'army', 'soldier', 'suzanne', 'gigi', 'akiko', 'cavaliers', 'chl', 'concordat', 'brenda', 'gal', 'bowlers', 'shapur', 'brantford', 'sent', 'anchors', 'treasury', 'penalties', 'ella', 'bremerton', 'rms', 'major', 'hooker', 'gwen', 'wingman', 'lombardy', 'nomads', 'pseudoscience', 'boy', 'jeanne', 'midler', 'francesca', 'ulster', 'hulls', 'mother', 'thelma', 'ovaries', 'supermodel', 'archdeacon', 'saladin', 'sterilized', 'jetty', 'playford', 'sheriff', 'general', 'midland', 'flamengo', 'cyndi', 'liège', 'becca', 'candace', 'hasidim', 'brooke', 'carpi', 'halfback', 'confides', 'freya', 'infidelity', 'stadtholder', 'lokomotiv', 'juliet', 'daphne', 'begum', 'contributed', 'eniwetok', 'yokosuka', 'agnieszka', 'watford', 'pompey', 'elizabeth', 'muhammad', 'jesuits', 'hitler', 'quilts', 'darlington', 'cleric', 'chloe', 'tenchi', 'marries', 'martha', 'ballymena', 'childcare', 'eels', 'capone', 'katy', 'serena', 'peterborough', 'fleur', 'haa', 'leafs', 'redskins', 'brie', 'marguerite', 'chiefs', 'brewers', 'yui', 'ingraham', 'maestro', 'hingis', 'brechin', 'xv', 'camped', 'laura', 'gaby', 'kirsten', 'romagna', 'hesketh', 'leinster', 'offices', 'rfu', 'anastasia', 'brigadier', 'keble', 'scissor', 'whittingham', 'mizuki', 'ellie', 'premiership', 'monique', 'thomas', 'amphibious', 'braided', 'vojvodina', 'positional', 'norwich', 'aphrodite', 'padua', 'player', 'divas', 'irma', 'ahl', 'carla', 'eugenie', 'simona', 'scuttled', 'beached', 'maersk', 'adrienne', 'schaffhausen', 'córdoba', 'juanita', 'marlene', 'cosmetics', 'adl', 'relegation', 'parenting', 'cb', 'cécile', 'dealership', 'adria', 'escorting', 'cruises', 'heartbroken', 'liga', 'eloise', 'mentioned', 'inga', 'gillian', 'wisden', 'angelique', 'glamour', 'johanna', 'sind', 'renegades', 
'refitting', 'mls', 'barnsley', 'shaanxi', 'her', 'landholdings', 'hera', 'succeeded', 'shakhtar', 'tds', 'praetorian', 'frescoed', 'rendezvoused', 'gauls', 'mia', 'kardashian', 'receptions', 'carranza', 'usha', 'beal', 'pachuca', 'colonels', 'kumari', 'shimmer', 'renée', 'goc', 'stephanie', 'mariana', 'vicar', 'rector', 'natalia', 'leuven', 'pageants', 'shruti', 'position', 'snr', 'agatha', 'batsman', 'bishops', 'establish', 'madeleine', 'divisional', 'leanne', 'eileen', 'charterhouse', 'donna', 'jima', 'taxation', 'twente', 'leila', 'lydia', 'jeanette', 'ships', 'estelle', 'berenice', 'celeste', 'zeller', 'lombardi', 'lilith', 'minelayer', 'daughter', 'radwańska', 'minesweeper', 'amidships', 'audrey', 'administrations', 'ethel', 'moored', 'virgil', 'topless', 'northamptonshire', 'starting', 'stosur', 'dinah', 'lineman', 'corinne', 'subscribed', 'vizier', 'ad', 'tarawa', 'susan', 'herself', 'bank', 'battlecruiser', 'amassing', 'proclaimed', 'millwall', 'burnley', 'cherbourg', 'constitutions', 'commandant', 'seema', 'declared', 'aleksandra', 'argonauts', 'benevento', 'archbishop', 'bisons', 'wanderers', 'gisela', 'profited', 'scrapping', 'corinthians', 'dawkins', 'tearful', 'caitlin', 'janeway', 'avignon', 'gabrielle', 'harrow', 'persephone', 'tracts', 'pro', 'dioecious', 'opponents', 'bella', 'haruka', 'patricia', 'comforted', 'priya', 'jenny', 'officials', 'enchantment', 'hollandia', 'saipan', 'rugby', 'lucille', 'rigging', 'minesweepers', 'ingolstadt', 'montgomeryshire', 'warwickshire', 'umpires', 'fashion', 'patty', 'josephine', 'carlotta', 'fielders', 'rudders', 'perugia', 'tackles', 'treatise', 'those', 'cardinals', 'appoint', 'pussycat', 'afl', 'cervix', 'mullan', 'winehouse', 'panthers', 'erinsborough', 'lisburn', 'rectory', 'gaddafi', 'emily', 'kimberly', 'counterinsurgency', 'stronghold', 'principia', 'mashhad', 'kesha', 'winchester', 'inactivation', 'playoffs', 'decommissioned', 'years', 'toros', 'tomboy', 'boasted', 'successively', 'managerial', 'psv', 
'imola', 'promulgated', 'episcopate', 'obtained', 'coyotes', 'loyal', 'safed', 'rank', 'defeat', 'transiting', 'wilhelmshaven', 'eunice', 'cecily', 'spurs', 'outfielder', 'wycombe', 'frémont', 'barca', 'catwalk', 'erin', 'kate', 'stena', 'maru', 'nisha', 'retirement', 'elly', 'commentaries', 'engineers', 'aquinas', 'cfl', 'roughriders', 'katarina', 'platoon', 'punting', 'male', 'thea', 'statistical', 'shakira', 'princeps', 'sylvie', 'circuits', 'knots', 'artemis', 'harbor', 'infertility', 'vera', 'fr', 'lorna', 'shelbourne', 'linebacker', 'towing', 'backfield', 'charlton', 'menstrual', 'nervosa', 'huracán', 'mildred', 'durham', 'jharkhand', 'scientific', 'lucy', 'peggy', 'seductive', 'pontifical', 'rosie', 'worcester', 'curate', 'lamborghini', 'suspended', 'hayley', 'irena', 'she', 'mademoiselle', 'adela', 'rebels', 'electropop', 'yorkshire', 'wiltshire', 'assists', 'elsie', 'mughals', 'luoyang', 'englishman', 'lst', 'near', 'carina', 'jenna', 'rewarded', 'he', 'tonality', 'sunk', 'liza', 'peshawar', 'canons', 'cora', 'milltown', 'employed', 'deputy', 'meg', 'wendy', 'haydn', 'suffragette', 'ecclesiastical', 'journeyman', 'hardline', 'annabel', 'faisalabad', 'ingeborg', 'moldavia', 'mythos', 'yvonne', 'lotta', 'oiler', 'zemun', 'sank', 'bologna', 'romans', 'britney', 'sexy', 'irina', 'kitchener', 'velma', 'sweeps', 'robyn', 'racking', 'granted', 'ferried', 'bebe', 'speight', 'chaplain', 'katherine', 'rawlinson', 'resigned', 'ingrid', 'transjordan', 'chetniks', 'feelings', 'netted', 'ayumi', 'posse', 'phoebe', 'tiffany', 'baroda', 'blanche', 'elsa', 'unloading', 'speculators', 'popper', 'buffalo', 'bubblegum', 'azalea', 'playmate', 'xi', 'veils', 'forlì', 'burnet', 'irene', 'vicarage', 'broncos', 'patents', 'capsized', 'titania', 'madeline', 'colchester', 'apprenticed', 'civil', 'hc', 'sortied', 'punter', 'sparingly', 'feyenoord', 'cheryl', 'rovers', 'dower', 'episcopacy', 'leaders', 'fielding', 'earhart', 'transgender', 'mitosis', 'cindy', 'in', 'leeds', 'tuam', 
'corset', 'dso', 'raquel', 'constance', 'cleavage', 'commissary', 'spouse', 'melina', 'galina']\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pairs used in PCA:  10\n"
     ]
    }
   ],
   "source": [
    "from utils import extract_vectors\n",
    "from utils import train_and_predict\n",
    "from utils import doPCA, drop\n",
    "\n",
    "size=1000\n",
    "# Sorted ascending by (he-similarity minus she-similarity): the head of the\n",
    "# list holds the lowest scores, the tail the highest.\n",
    "sorted_g = sorted(gender_bias_bef.items(), key=operator.itemgetter(1))\n",
    "female = [item[0] for item in sorted_g[:size]]\n",
    "male = [item[0] for item in sorted_g[-size:]]\n",
    "# Labels follow the order male + female used for clustering below.\n",
    "y_true = [1]*size + [0]*size\n",
    "\n",
    "# Small working vocabulary: the selected words plus the definitional words\n",
    "# that exist in the embeddings.\n",
    "c_vocab = list(set(male + female + [word for word in definitional_words if word in w2i]))\n",
    "c_w2i = {w: idx for idx, w in enumerate(c_vocab)}\n",
    "\n",
    "precisions = []\n",
    "\n",
    "for component_id in range(20):\n",
    "\n",
    "    print('component id: ', component_id)\n",
    "\n",
    "    wv_debiased = hard_debias(wv, w2i, w2i_partial = c_w2i, vocab_partial = c_vocab, component_ids = [component_id])\n",
    "\n",
    "    # These per-component matrices are only needed for the clustering score.\n",
    "    # They are not written to disk: appending them to the output file put 20\n",
    "    # partial matrices in front of the final embeddings, so a single\n",
    "    # pickle.load() returned the wrong object.\n",
    "    _, _, _, preci = cluster_and_visualize(male + female,\n",
    "                                           extract_vectors(male + female, wv_debiased, c_w2i), 1, y_true)\n",
    "    precisions.append(preci)\n",
    "\n",
    "# remove the 2nd component and save the word embeddings in a file\n",
    "component_id=1\n",
    "wv_debiased = hard_debias(wv, w2i, w2i_partial = w2i, vocab_partial = vocab, component_ids = [component_id])\n",
    "# save to output file\n",
    "# 'wb' instead of 'ab': the file holds exactly one object and re-running the\n",
    "# cell overwrites it instead of growing the file.\n",
    "filename=\"data/dhd_glove_reproduce.p\"\n",
    "with open(filename, 'wb') as fp:\n",
    "    pickle.dump(wv_debiased,fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Clustering accuracy measured after projecting out each principal component.\n",
    "data1 = precisions\n",
    "# 1-based component index, sized from the results so x and y always align.\n",
    "t = np.arange(1, len(data1) + 1)\n",
    "\n",
    "fig, ax1 = plt.subplots(figsize=(6,2.8))\n",
    "\n",
    "color = 'red'\n",
    "ax1.set_xlabel('Project out the D-th directions', fontsize=17)\n",
    "ax1.set_ylabel('accuracy', fontsize=17)\n",
    "ax1.scatter(t, data1, color=color, label='GloVe', marker = 'x', s=60)\n",
    "plt.xticks([2,4,6,8,10, 12, 14, 16 ,18, 20], fontsize=15)\n",
    "ax1.tick_params(axis='y', labelsize=14)\n",
    "# Fixed y-range; points outside 0.65-0.84 will not be visible.\n",
    "ax1.set_ylim(0.65, 0.84)\n",
    "ax1.legend(loc='lower right', frameon=True, fontsize='large')\n",
    "ax1.grid(axis='y')\n",
    "\n",
    "fig.tight_layout()  # keep the large axis labels inside the figure\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
