{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Building swear-word replacement dictionaries with word2vec\n",
    "\n",
    "This notebook trains a word2vec model on the Blodgett dataset and also uses a word2vec model pre-trained on the Google News dataset (`word2vec-google-news-300`).\n",
    "\n",
    "`Blodgett_dict.json` and `google_news_dict.json` are the dictionaries created by this notebook; copies of both are also included in our supplementary materials.\n",
    "\n",
    "`LIWC_swear_dictionary.txt` is the LIWC dictionary used by the authors of the original paper, and we use the same one. The documentation for LIWC2007 can be found at: http://www.gruberpeplab.com/teaching/psych231_fall2013/documents/231_Pennebaker2007.pdf\n",
    "A copy of this dictionary is also included in our supplementary materials.\n",
    "\n",
    "All input files are expected in `DATA_DIR` (see the configuration cell)."
   ],
   "metadata": {
    "id": "XWmV8ZNLsQPb"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "AUJJsgpAph6N"
   },
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import gensim\n",
    "import gensim.downloader\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: every path and tunable value used below.\n",
    "DATA_DIR = Path(\".\")  # folder holding the input files and receiving the JSON outputs\n",
    "LIWC_PATH = DATA_DIR / \"LIWC_swear_dictionary.txt\"\n",
    "BLODGETT_CSV = DATA_DIR / \"Blodgett_500k.csv\"\n",
    "GOOGLE_NEWS_DICT_JSON = DATA_DIR / \"google_news_dict.json\"\n",
    "BLODGETT_DICT_JSON = DATA_DIR / \"Blodgett_dict.json\"\n",
    "\n",
    "# Pre-trained vectors: name understood by gensim.downloader, and the location\n",
    "# of a previously downloaded copy under the user's home directory.\n",
    "GOOGLE_NEWS_NAME = \"word2vec-google-news-300\"\n",
    "GOOGLE_NEWS_PATH = Path.home() / \"gensim-data\" / GOOGLE_NEWS_NAME / f\"{GOOGLE_NEWS_NAME}.gz\"\n",
    "\n",
    "# Number of nearest neighbours inspected per word. 10 is the default of\n",
    "# most_similar() and is the depth the earlier version of this notebook\n",
    "# effectively searched for both models; raise it to widen the search.\n",
    "TOPN_CANDIDATES = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_swear_words(path):\n",
    "    \"\"\"Return the first whitespace-separated token of every non-blank line in `path`, as a set.\"\"\"\n",
    "    with open(path) as f:\n",
    "        # Blank lines are skipped: split() on them yields no token to take.\n",
    "        return {line.split()[0] for line in f if line.strip()}\n",
    "\n",
    "\n",
    "def build_replacement_dict(vectors, words, topn=TOPN_CANDIDATES):\n",
    "    \"\"\"Map each word to its most similar neighbour in `vectors` that passes the filter.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    vectors : object exposing `most_similar(word, topn=...)`, e.g. gensim KeyedVectors\n",
    "    words : set of str\n",
    "        Words to find replacements for; neighbours found in this set are rejected.\n",
    "    topn : int\n",
    "        How many nearest neighbours to inspect per word.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict mapping word -> replacement. A word is left out when `most_similar`\n",
    "    raises KeyError for it, or when none of its `topn` neighbours is accepted.\n",
    "    \"\"\"\n",
    "    replacements = {}\n",
    "    skipped = []\n",
    "    # Sorted so the key order of the resulting JSON is the same on every run.\n",
    "    for word in sorted(words):\n",
    "        try:\n",
    "            # One query per word; neighbours come back ordered by similarity.\n",
    "            neighbours = vectors.most_similar(word, topn=topn)\n",
    "        except KeyError:\n",
    "            skipped.append(word)  # the model has no vector for this word\n",
    "            continue\n",
    "        for candidate, _similarity in neighbours:\n",
    "            # Reject a neighbour that contains the word, is contained in it,\n",
    "            # or is itself a listed word (presumably to avoid variants of the\n",
    "            # same word and other swear words).\n",
    "            if candidate in word or word in candidate or candidate in words:\n",
    "                continue\n",
    "            replacements[word] = candidate\n",
    "            break\n",
    "        else:\n",
    "            skipped.append(word)  # every inspected neighbour was rejected\n",
    "    print(f\"{len(replacements)} replacements found, {len(skipped)} words skipped\")\n",
    "    return replacements"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Google News dictionary"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "source": [
    "# Use the local copy of the pre-trained vectors when it exists; otherwise let\n",
    "# gensim.downloader fetch and load them.\n",
    "if GOOGLE_NEWS_PATH.exists():\n",
    "    model_gn = gensim.models.KeyedVectors.load_word2vec_format(str(GOOGLE_NEWS_PATH), binary=True)\n",
    "else:\n",
    "    model_gn = gensim.downloader.load(GOOGLE_NEWS_NAME)\n",
    "\n",
    "word_set = load_swear_words(LIWC_PATH)\n",
    "replace_dict_1 = build_replacement_dict(model_gn, word_set)"
   ],
   "metadata": {
    "id": "WzT24U3qpryb"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "with open(GOOGLE_NEWS_DICT_JSON, \"w\") as f:\n",
    "    json.dump(replace_dict_1, f)"
   ],
   "metadata": {
    "id": "oES7XMR0p6gD"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Blodgett dictionary"
   ],
   "metadata": {
    "id": "PQTj8SQevX9E"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Load the dataset\n",
    "df = pd.read_csv(BLODGETT_CSV)\n",
    "\n",
    "# Tokenise and clean each tweet so it can be used for training\n",
    "review_text = df.tweet_text.apply(gensim.utils.simple_preprocess)\n",
    "\n",
    "# Create an untrained Word2Vec model (nothing is loaded from disk here)\n",
    "model = gensim.models.Word2Vec(\n",
    "    window=10,\n",
    "    min_count=2,\n",
    "    workers=4,\n",
    ")\n",
    "\n",
    "# Build the vocabulary from the tokenised tweets\n",
    "model.build_vocab(review_text, progress_per=1000)\n",
    "\n",
    "# Train the word2vec model; the tuple displayed below is train()'s return value\n",
    "model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)\n"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "JNpf6eBAtR23",
    "outputId": "787bb7db-8dac-4b2a-9337-d034efb1fbc9"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(16880938, 21629045)"
      ]
     },
     "metadata": {},
     "execution_count": 28
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Re-read the word list so this section can run without the Google News one.\n",
    "word_set = load_swear_words(LIWC_PATH)\n",
    "replace_dict_2 = build_replacement_dict(model.wv, word_set)"
   ],
   "metadata": {
    "id": "F51KZ4dEtSp0"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "with open(BLODGETT_DICT_JSON, \"w\") as f:\n",
    "    json.dump(replace_dict_2, f)"
   ],
   "metadata": {
    "id": "oGs7NoTpvAGT"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}