{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Predictions on uncensored and censored data\n",
    "#### This notebook loads the Blodget_50k.csv data, both BERT classifier models that were built with the 03_Train_BERT_Models.ipynb notebook, and both replacement dictionaries that were generated with the 04_Train_word2vec_model_and_create_word_replacement_dictionary.ipynb notebook.\n",
    "\n",
    "#### We first classify the original tweets, then replace the swear words and run the same classification on the reworded tweets."
   ],
   "metadata": {
    "id": "SdhdAOvKn-bc"
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Importing the libraries"
   ],
   "metadata": {
    "id": "lhbtXAi4w6CC"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# A dependency of the preprocessing for BERT inputs\n",
    "%pip install -q -U \"tensorflow-text==2.8.*\"\n",
    "# You will use the AdamW optimizer from tensorflow/models.\n",
    "%pip install -q tf-models-official==2.7.0"
   ],
   "metadata": {
    "id": "YtsQeqRL_U8I"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "NmF21M77myiP"
   },
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import json  # used below to read the word-replacement dictionaries\n",
    "import os\n",
    "import random\n",
    "import shutil\n",
    "from collections import defaultdict\n",
    "\n",
    "# Third-party\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "import tensorflow_hub as hub\n",
    "import tensorflow_text as text  # dependency of the BERT input preprocessing (see install cell)\n",
    "from official.nlp import optimization  # to create AdamW optimizer\n",
    "\n",
    "tf.get_logger().setLevel('ERROR')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Loading the BERT classifiers to predict the class of tweets"
   ],
   "metadata": {
    "id": "yBoAQBt83OP0"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Load the BERT classifiers obtained by running the 03_Train_BERT_Models.ipynb notebook\n",
    "model_bert_1 = tf.saved_model.load(\"/home/cc/faunta_bert\")\n",
    "model_bert_2 = tf.saved_model.load(\"/home/cc/davidson_bert\")\n",
    "\n",
    "\n",
    "def predict(text):\n",
    "  \"\"\"Classify a single tweet with model_bert_1 and return its label.\n",
    "\n",
    "  The tweet is sent to the model as a one-element batch. The index of the\n",
    "  largest value in the (flattened) model output selects the label:\n",
    "  0 -> \"abusive\", 1 -> \"hateful\", 2 -> \"normal\", anything else -> \"spam\".\n",
    "  \"\"\"\n",
    "  category = model_bert_1(tf.constant([text]))\n",
    "  result = np.argmax(np.array(category))\n",
    "  if result == 0:\n",
    "    return \"abusive\"\n",
    "  elif result == 1:\n",
    "    return \"hateful\"\n",
    "  elif result == 2:\n",
    "    return \"normal\"\n",
    "  else:\n",
    "    return \"spam\"\n",
    "\n",
    "\n",
    "def predict2(text):\n",
    "  \"\"\"Classify a single tweet with model_bert_2 and return its label.\n",
    "\n",
    "  The tweet is sent to the model as a one-element batch. The index of the\n",
    "  largest value in the (flattened) model output selects the label:\n",
    "  0 -> \"hate\", 1 -> \"neither\", 2 -> \"offensive\". Any other index falls\n",
    "  through and returns None.\n",
    "  \"\"\"\n",
    "  category = model_bert_2(tf.constant([text]))\n",
    "  result = np.argmax(np.array(category))\n",
    "  if result == 0:\n",
    "    return \"hate\"\n",
    "  elif result == 1:\n",
    "    return \"neither\"\n",
    "  elif result == 2:\n",
    "    return \"offensive\"\n"
   ],
   "metadata": {
    "id": "7RHt8Q4K3P1X"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "tweet_data = pd.read_csv(\"Blodget_50k.csv\")"
   ],
   "metadata": {
    "id": "vxJFd9PzkapJ"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Baseline: classify the original, unmodified tweets with both models\n",
    "tweet_data[\"predictions_dwmw\"] = tweet_data[\"tweet_text\"].apply(predict2)\n",
    "tweet_data[\"predictions_fdcl\"] = tweet_data[\"tweet_text\"].apply(predict)"
   ],
   "metadata": {
    "id": "2rRZvZdvd5ZL"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Replacing swear words using the dictionary built from the word2vec model trained on the Google news 300m dataset, then classifying with both BERT classifiers"
   ],
   "metadata": {
    "id": "MiVFvhrLNPzt"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "with open('google_news_dict.json') as f:\n",
    "    replace_dict_1 = json.load(f)"
   ],
   "metadata": {
    "id": "-5XchjeI221p"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "def reword(text, replacements):\n",
    "  \"\"\"Return `text` with every token found in `replacements` swapped out.\n",
    "\n",
    "  `text` is split on whitespace; each token that is a key of `replacements`\n",
    "  is replaced by the mapped value, all other tokens are kept unchanged.\n",
    "  The tokens are re-joined with single spaces.\n",
    "  \"\"\"\n",
    "  return \" \".join(replacements.get(word, word) for word in text.split())\n",
    "\n",
    "\n",
    "# `args` forwards the dictionary to reword() as its second positional argument\n",
    "tweet_data.loc[:,'reworded_1'] = tweet_data[\"tweet_text\"].apply(reword, args=(replace_dict_1,))"
   ],
   "metadata": {
    "id": "EZJJe7QrFyi6"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "prediction = tweet_data['reworded_1'].apply(predict)\n",
    "prediction2 = tweet_data['reworded_1'].apply(predict2)\n",
    "tweet_data.loc[:,'fdcl_predictions_on_reworded_1'] = prediction\n",
    "tweet_data.loc[:,'dwmw_predictions_on_reworded_1'] = prediction2"
   ],
   "metadata": {
    "id": "o8V5cnlBPzJR"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Rows whose text was actually changed by the first replacement dictionary\n",
    "reworded_sample = tweet_data[tweet_data[\"tweet_text\"] != tweet_data['reworded_1']]"
   ],
   "metadata": {
    "id": "9p2pJpHvs782"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Replacing swear words using the dictionary built from the word2vec model trained on the Blodgett AAE dataset, then classifying with both BERT classifiers"
   ],
   "metadata": {
    "id": "7aJbrGdj4E16"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "with open('Blodgett_dict.json') as f:\n",
    "    replace_dict_2 = json.load(f)"
   ],
   "metadata": {
    "id": "tlq-LynQ3RrE"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Same reword() as above, this time with the second replacement dictionary\n",
    "tweet_data.loc[:,'reworded_2'] = tweet_data[\"tweet_text\"].apply(reword, args=(replace_dict_2,))"
   ],
   "metadata": {
    "id": "2oNCLteiz-U-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "prediction = tweet_data['reworded_2'].apply(predict)\n",
    "tweet_data.loc[:,'predictions_on_reworded_2'] = prediction"
   ],
   "metadata": {
    "id": "jJ4cHx_H4K5y"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "prediction = tweet_data['reworded_2'].apply(predict2)\n",
    "tweet_data.loc[:,'Dwmw_predictions_on_reworded_2'] = prediction"
   ],
   "metadata": {
    "id": "bUkkMjV9sPcH"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Word replacement using asterisks in place of swear words, then classifying with both BERT classifiers"
   ],
   "metadata": {
    "id": "XX1ueXTq4Beb"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "with open(\"/home/cc/LIWC_swear_dictionary.txt\") as f:\n",
    "  # Keep the first whitespace-separated token of every line. Blank lines are\n",
    "  # skipped because split()[0] would raise IndexError on them.\n",
    "  word_set = {line.split()[0] for line in f if line.strip()}\n",
    "\n",
    "\n",
    "def reword5(text):\n",
    "  \"\"\"Return `text` with every token found in `word_set` masked by asterisks.\n",
    "\n",
    "  `text` is split on whitespace; a token that is in `word_set` is replaced\n",
    "  by a run of '*' of the same length, other tokens are kept unchanged.\n",
    "  The tokens are re-joined with single spaces.\n",
    "  \"\"\"\n",
    "  return \" \".join(\"*\" * len(word) if word in word_set else word for word in text.split())"
   ],
   "metadata": {
    "id": "nv9a7MbtqVEg"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "tweet_data.loc[:,\"censored\"] = tweet_data[\"tweet_text\"].apply(reword5)"
   ],
   "metadata": {
    "id": "UVJigpLS0l-C"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "tweet_data.loc[:,'fdcl_predictions_on_censored'] = tweet_data['censored'].apply(predict)\n",
    "tweet_data.loc[:,'dwmw_predictions_on_censored'] = tweet_data['censored'].apply(predict2)"
   ],
   "metadata": {
    "id": "-9h9w-q8-8II"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "tweet_data.to_csv(\"tweet_data.csv\",index=False)"
   ],
   "metadata": {
    "id": "m1TK2c1ZMcya"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}