{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(341, 48)\n",
      "Index(['level_0', 'Unnamed: 0', 'sessionid', 'content', 'database', 'repeat',\n",
      "       'nCopies', 'copyIDs', 'phq', 'demographics', 'tPrompt', 'twitter',\n",
      "       'hasTwitter', 'mobile', 'date', 'Age', 'Gender', 'Student',\n",
      "       'PriorTreatement', 'Group', 'Covid', 'item1', 'item2', 'item3', 'item4',\n",
      "       'item5', 'item6', 'item7', 'item8', 'item9', 'phq9', 'lentext',\n",
      "       'lenCon', 'lenTweets', 'sharedemographics', 'sharetPrompt',\n",
      "       'shareaPrompt', 'shareaudio', 'sharecalendar', 'shareclog',\n",
      "       'sharecontact', 'sharetlog', 'sharegps', 'shareusername', 'sharetweets',\n",
      "       'index', 'Covid1', 'Covid2'],\n",
      "      dtype='object')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>level_0</th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>sessionid</th>\n",
       "      <th>content</th>\n",
       "      <th>database</th>\n",
       "      <th>repeat</th>\n",
       "      <th>nCopies</th>\n",
       "      <th>copyIDs</th>\n",
       "      <th>phq</th>\n",
       "      <th>demographics</th>\n",
       "      <th>...</th>\n",
       "      <th>sharecalendar</th>\n",
       "      <th>shareclog</th>\n",
       "      <th>sharecontact</th>\n",
       "      <th>sharetlog</th>\n",
       "      <th>sharegps</th>\n",
       "      <th>shareusername</th>\n",
       "      <th>sharetweets</th>\n",
       "      <th>index</th>\n",
       "      <th>Covid1</th>\n",
       "      <th>Covid2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8181</td>\n",
       "      <td>\"My favorite place is any bubble tea cafe beca...</td>\n",
       "      <td>summer</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>['{\"Q0\":\"0\",\"Q1\":\"1\",\"Q2\":\"0\",\"Q3\":\"0\",\"Q4\":\"1...</td>\n",
       "      <td>['{\"Q0\":\"18-23\",\"Q1\":\"Woman\",\"Q2\":\"Yes, I am a...</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>8170</td>\n",
       "      <td>\"My favorite place is around the people I love.\"</td>\n",
       "      <td>summer</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>['{\"Q0\":\"1\",\"Q1\":\"1\",\"Q2\":\"0\",\"Q3\":\"1\",\"Q4\":\"1...</td>\n",
       "      <td>['{\"Q0\":\"18-23\",\"Q1\":\"Woman\",\"Q2\":\"Yes, I am a...</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 48 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   level_0  Unnamed: 0  sessionid  \\\n",
       "0        0           0       8181   \n",
       "1        1           1       8170   \n",
       "\n",
       "                                             content database  repeat  \\\n",
       "0  \"My favorite place is any bubble tea cafe beca...   summer     NaN   \n",
       "1   \"My favorite place is around the people I love.\"   summer     NaN   \n",
       "\n",
       "   nCopies copyIDs                                                phq  \\\n",
       "0      NaN     NaN  ['{\"Q0\":\"0\",\"Q1\":\"1\",\"Q2\":\"0\",\"Q3\":\"0\",\"Q4\":\"1...   \n",
       "1      NaN     NaN  ['{\"Q0\":\"1\",\"Q1\":\"1\",\"Q2\":\"0\",\"Q3\":\"1\",\"Q4\":\"1...   \n",
       "\n",
       "                                        demographics  ... sharecalendar  \\\n",
       "0  ['{\"Q0\":\"18-23\",\"Q1\":\"Woman\",\"Q2\":\"Yes, I am a...  ...           1.0   \n",
       "1  ['{\"Q0\":\"18-23\",\"Q1\":\"Woman\",\"Q2\":\"Yes, I am a...  ...           1.0   \n",
       "\n",
       "  shareclog sharecontact  sharetlog  sharegps shareusername sharetweets index  \\\n",
       "0       1.0          1.0        1.0       0.0             0         0.0   NaN   \n",
       "1       1.0          1.0        1.0       1.0             0         0.0   NaN   \n",
       "\n",
       "  Covid1 Covid2  \n",
       "0    NaN    NaN  \n",
       "1    NaN    NaN  \n",
       "\n",
       "[2 rows x 48 columns]"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Author: ML Tlachac, WPI\n",
    "#For StudentSADD, 2021\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "data = pd.read_csv(\"extractedDataShort.csv\")\n",
    "data = data[data.sharetPrompt == 1].reset_index()\n",
    "print(data.shape)\n",
    "print(data.columns)\n",
    "data[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>phq9</th>\n",
       "      <th>q9</th>\n",
       "      <th>Content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>8181</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>\"My favorite place is any bubble tea cafe beca...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8170</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>\"My favorite place is around the people I love.\"</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     ID  phq9  q9                                            Content\n",
       "0  8181     3   0  \"My favorite place is any bubble tea cafe beca...\n",
       "1  8170     4   0   \"My favorite place is around the people I love.\""
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pDFt = pd.DataFrame()\n",
    "pDFt[\"ID\"] = data.sessionid\n",
    "pDFt[\"phq9\"] = data[\"phq9\"]\n",
    "pDFt[\"q9\"] = data[\"item9\"]\n",
    "pDFt[\"Content\"] = data.content\n",
    "pDFt[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate length and remove quotes from content\n",
    "content = []\n",
    "characters = []\n",
    "words = []\n",
    "for i in range(0, pDFt.shape[0]):\n",
    "    pDFt.Content[i][1:-1]\n",
    "    characters.append(len(pDFt.Content[i][1:-1]))\n",
    "    words.append(len(pDFt.Content[i][1:-1].split(\" \")))\n",
    "    if i <= 133:\n",
    "        content.append(pDFt.Content[i][1:-1])\n",
    "    else:\n",
    "        content.append(pDFt.Content[i])\n",
    "        \n",
    "pDFt[\"Content\"] = content\n",
    "pDFt[\"Characters\"] = characters\n",
    "pDFt[\"Words\"] = words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from textblob import TextBlob\n",
    "import textblob as tb\n",
    "import json\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "1\n",
      "2\n",
      "3\n",
      "4\n",
      "5\n",
      "6\n",
      "7\n",
      "8\n",
      "9\n",
      "10\n",
      "11\n",
      "12\n",
      "13\n",
      "14\n",
      "15\n",
      "16\n",
      "17\n",
      "18\n",
      "19\n",
      "20\n",
      "21\n",
      "22\n",
      "23\n",
      "24\n",
      "25\n",
      "26\n",
      "27\n",
      "28\n",
      "29\n",
      "30\n",
      "31\n",
      "32\n",
      "33\n",
      "34\n",
      "35\n",
      "36\n",
      "37\n",
      "38\n",
      "39\n",
      "40\n",
      "41\n",
      "42\n",
      "43\n",
      "44\n",
      "45\n",
      "46\n",
      "47\n",
      "48\n",
      "49\n",
      "50\n",
      "51\n",
      "52\n",
      "53\n",
      "54\n",
      "55\n",
      "56\n",
      "57\n",
      "58\n",
      "59\n",
      "60\n",
      "61\n",
      "62\n",
      "63\n",
      "64\n",
      "65\n",
      "66\n",
      "67\n",
      "68\n",
      "69\n",
      "70\n",
      "71\n",
      "72\n",
      "73\n",
      "74\n",
      "75\n",
      "76\n",
      "77\n",
      "78\n",
      "79\n",
      "80\n",
      "81\n",
      "82\n",
      "83\n",
      "84\n",
      "85\n",
      "86\n",
      "87\n",
      "88\n",
      "89\n",
      "90\n",
      "91\n",
      "92\n",
      "93\n",
      "94\n",
      "95\n",
      "96\n",
      "97\n",
      "98\n",
      "99\n",
      "100\n",
      "101\n",
      "102\n",
      "103\n",
      "104\n",
      "105\n",
      "106\n",
      "107\n",
      "108\n",
      "109\n",
      "110\n",
      "111\n",
      "112\n",
      "113\n",
      "114\n",
      "115\n",
      "116\n",
      "117\n",
      "118\n",
      "119\n",
      "120\n",
      "121\n",
      "122\n",
      "123\n",
      "124\n",
      "125\n",
      "126\n",
      "127\n",
      "128\n",
      "129\n",
      "130\n",
      "131\n",
      "132\n",
      "133\n",
      "134\n",
      "135\n",
      "136\n",
      "137\n",
      "138\n",
      "139\n",
      "140\n",
      "141\n",
      "142\n",
      "143\n",
      "144\n",
      "145\n",
      "146\n",
      "147\n",
      "148\n",
      "149\n",
      "150\n",
      "151\n",
      "152\n",
      "153\n",
      "154\n",
      "155\n",
      "156\n",
      "157\n",
      "158\n",
      "159\n",
      "160\n",
      "161\n",
      "162\n",
      "163\n",
      "164\n",
      "165\n",
      "166\n",
      "167\n",
      "168\n",
      "169\n",
      "170\n",
      "171\n",
      "172\n",
      "173\n",
      "174\n",
      "175\n",
      "176\n",
      "177\n",
      "178\n",
      "179\n",
      "180\n",
      "181\n",
      "182\n",
      "183\n",
      "184\n",
      "185\n",
      "186\n",
      "187\n",
      "188\n",
      "189\n",
      "190\n",
      "191\n",
      "192\n",
      "193\n",
      "194\n",
      "195\n",
      "196\n",
      "197\n",
      "198\n",
      "199\n",
      "200\n",
      "201\n",
      "202\n",
      "203\n",
      "204\n",
      "205\n",
      "206\n",
      "207\n",
      "208\n",
      "209\n",
      "210\n",
      "211\n",
      "212\n",
      "213\n",
      "214\n",
      "215\n",
      "216\n",
      "217\n",
      "218\n",
      "219\n",
      "220\n",
      "221\n",
      "222\n",
      "223\n",
      "224\n",
      "225\n",
      "226\n",
      "227\n",
      "228\n",
      "229\n",
      "230\n",
      "231\n",
      "232\n",
      "233\n",
      "234\n",
      "235\n",
      "236\n",
      "237\n",
      "238\n",
      "239\n",
      "240\n",
      "241\n",
      "242\n",
      "243\n",
      "244\n",
      "245\n",
      "246\n",
      "247\n",
      "248\n",
      "249\n",
      "250\n",
      "251\n",
      "252\n",
      "253\n",
      "254\n",
      "255\n",
      "256\n",
      "257\n",
      "258\n",
      "259\n",
      "260\n",
      "261\n",
      "262\n",
      "263\n",
      "264\n",
      "265\n",
      "266\n",
      "267\n",
      "268\n",
      "269\n",
      "270\n",
      "271\n",
      "272\n",
      "273\n",
      "274\n",
      "275\n",
      "276\n",
      "277\n",
      "278\n",
      "279\n",
      "280\n",
      "281\n",
      "282\n",
      "283\n",
      "284\n",
      "285\n",
      "286\n",
      "287\n",
      "288\n",
      "289\n",
      "290\n",
      "291\n",
      "292\n",
      "293\n",
      "294\n",
      "295\n",
      "296\n",
      "297\n",
      "298\n",
      "299\n",
      "300\n",
      "301\n",
      "302\n",
      "303\n",
      "304\n",
      "305\n",
      "306\n",
      "307\n",
      "308\n",
      "309\n",
      "310\n",
      "311\n",
      "312\n",
      "313\n",
      "314\n",
      "315\n",
      "316\n",
      "317\n",
      "318\n",
      "319\n",
      "320\n",
      "321\n",
      "322\n",
      "323\n",
      "324\n",
      "325\n",
      "326\n",
      "327\n",
      "328\n",
      "329\n",
      "330\n",
      "331\n",
      "332\n",
      "333\n",
      "334\n",
      "335\n",
      "336\n",
      "337\n",
      "338\n",
      "339\n",
      "340\n"
     ]
    }
   ],
   "source": [
    "#extract polarity, subjectivity, and part of speech tags for each word, stored as list for now\n",
    "tags = []\n",
    "for i in range(0, len(pDFt.ID)):\n",
    "    print(i)\n",
    "    tags2 = []\n",
    "    for text in pDFt.Content[i]:\n",
    "        T = TextBlob(str(text))\n",
    "        for word, tag in T.tags:\n",
    "            tags2.append(tag)\n",
    "    tags.append(tags2)\n",
    "pDFt[\"POStags\"] = tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['help', 'office', 'dance', 'money', 'wedding', 'domestic_work', 'sleep', 'medical_emergency', 'cold', 'hate', 'cheerfulness', 'aggression', 'occupation', 'envy', 'anticipation', 'family', 'vacation', 'crime', 'attractive', 'masculine', 'prison', 'health', 'pride', 'dispute', 'nervousness', 'government', 'weakness', 'horror', 'swearing_terms', 'leisure', 'suffering', 'royalty', 'wealthy', 'tourism', 'furniture', 'school', 'magic', 'beach', 'journalism', 'morning', 'banking', 'social_media', 'exercise', 'night', 'kill', 'blue_collar_job', 'art', 'ridicule', 'play', 'computer', 'college', 'optimism', 'stealing', 'real_estate', 'home', 'divine', 'sexual', 'fear', 'irritability', 'superhero', 'business', 'driving', 'pet', 'childish', 'cooking', 'exasperation', 'religion', 'hipster', 'internet', 'surprise', 'reading', 'worship', 'leader', 'independence', 'movement', 'body', 'noise', 'eating', 'medieval', 'zest', 'confusion', 'water', 'sports', 'death', 'healing', 'legend', 'heroic', 'celebration', 'restaurant', 'violence', 'programming', 'dominant_heirarchical', 'military', 'neglect', 'swimming', 'exotic', 'love', 'hiking', 'communication', 'hearing', 'order', 'sympathy', 'hygiene', 'weather', 'anonymity', 'trust', 'ancient', 'deception', 'fabric', 'air_travel', 'fight', 'dominant_personality', 'music', 'vehicle', 'politeness', 'toy', 'farming', 'meeting', 'war', 'speaking', 'listen', 'urban', 'shopping', 'disgust', 'fire', 'tool', 'phone', 'gain', 'sound', 'injury', 'sailing', 'rage', 'science', 'work', 'appearance', 'valuable', 'warmth', 'youth', 'sadness', 'fun', 'emotional', 'joy', 'affection', 'traveling', 'fashion', 'ugliness', 'lust', 'shame', 'torment', 'economics', 'anger', 'politics', 'ship', 'clothing', 'car', 'strength', 'technology', 'breaking', 'shape_and_size', 'power', 'white_collar_job', 'animal', 'party', 'terrorism', 'smell', 'disappointment', 'poor', 'plant', 'pain', 'beauty', 'timidity', 'philosophy', 'negotiate', 'negative_emotion', 'cleaning', 'messaging', 'competing', 'law', 'friends', 'payment', 'achievement', 'alcohol', 'liquid', 'feminine', 'weapon', 'children', 'monster', 'ocean', 'giving', 'contentment', 'writing', 'rural', 'positive_emotion', 'musical', 'colors', 'text_abbreviations']\n"
     ]
    }
   ],
   "source": [
    "from empath import Empath\n",
    "import re\n",
    "\n",
    "#create list of all empath categories\n",
    "\n",
    "lexicon = Empath()\n",
    "emp = lexicon.analyze(\"Testing\", normalize=True)\n",
    "wordlist = []\n",
    "for word, value in emp.items():\n",
    "    wordlist.append(word)\n",
    "print(wordlist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "help\n",
      "office\n",
      "dance\n",
      "money\n",
      "wedding\n",
      "domestic_work\n",
      "sleep\n",
      "medical_emergency\n",
      "cold\n",
      "hate\n",
      "cheerfulness\n",
      "aggression\n",
      "occupation\n",
      "envy\n",
      "anticipation\n",
      "family\n",
      "vacation\n",
      "crime\n",
      "attractive\n",
      "masculine\n",
      "prison\n",
      "health\n",
      "pride\n",
      "dispute\n",
      "nervousness\n",
      "government\n",
      "weakness\n",
      "horror\n",
      "swearing_terms\n",
      "leisure\n",
      "suffering\n",
      "royalty\n",
      "wealthy\n",
      "tourism\n",
      "furniture\n",
      "school\n",
      "magic\n",
      "beach\n",
      "journalism\n",
      "morning\n",
      "banking\n",
      "social_media\n",
      "exercise\n",
      "night\n",
      "kill\n",
      "blue_collar_job\n",
      "art\n",
      "ridicule\n",
      "play\n",
      "computer\n",
      "college\n",
      "optimism\n",
      "stealing\n",
      "real_estate\n",
      "home\n",
      "divine\n",
      "sexual\n",
      "fear\n",
      "irritability\n",
      "superhero\n",
      "business\n",
      "driving\n",
      "pet\n",
      "childish\n",
      "cooking\n",
      "exasperation\n",
      "religion\n",
      "hipster\n",
      "internet\n",
      "surprise\n",
      "reading\n",
      "worship\n",
      "leader\n",
      "independence\n",
      "movement\n",
      "body\n",
      "noise\n",
      "eating\n",
      "medieval\n",
      "zest\n",
      "confusion\n",
      "water\n",
      "sports\n",
      "death\n",
      "healing\n",
      "legend\n",
      "heroic\n",
      "celebration\n",
      "restaurant\n",
      "violence\n",
      "programming\n",
      "dominant_heirarchical\n",
      "military\n",
      "neglect\n",
      "swimming\n",
      "exotic\n",
      "love\n",
      "hiking\n",
      "communication\n",
      "hearing\n",
      "order\n",
      "sympathy\n",
      "hygiene\n",
      "weather\n",
      "anonymity\n",
      "trust\n",
      "ancient\n",
      "deception\n",
      "fabric\n",
      "air_travel\n",
      "fight\n",
      "dominant_personality\n",
      "music\n",
      "vehicle\n",
      "politeness\n",
      "toy\n",
      "farming\n",
      "meeting\n",
      "war\n",
      "speaking\n",
      "listen\n",
      "urban\n",
      "shopping\n",
      "disgust\n",
      "fire\n",
      "tool\n",
      "phone\n",
      "gain\n",
      "sound\n",
      "injury\n",
      "sailing\n",
      "rage\n",
      "science\n",
      "work\n",
      "appearance\n",
      "valuable\n",
      "warmth\n",
      "youth\n",
      "sadness\n",
      "fun\n",
      "emotional\n",
      "joy\n",
      "affection\n",
      "traveling\n",
      "fashion\n",
      "ugliness\n",
      "lust\n",
      "shame\n",
      "torment\n",
      "economics\n",
      "anger\n",
      "politics\n",
      "ship\n",
      "clothing\n",
      "car\n",
      "strength\n",
      "technology\n",
      "breaking\n",
      "shape_and_size\n",
      "power\n",
      "white_collar_job\n",
      "animal\n",
      "party\n",
      "terrorism\n",
      "smell\n",
      "disappointment\n",
      "poor\n",
      "plant\n",
      "pain\n",
      "beauty\n",
      "timidity\n",
      "philosophy\n",
      "negotiate\n",
      "negative_emotion\n",
      "cleaning\n",
      "messaging\n",
      "competing\n",
      "law\n",
      "friends\n",
      "payment\n",
      "achievement\n",
      "alcohol\n",
      "liquid\n",
      "feminine\n",
      "weapon\n",
      "children\n",
      "monster\n",
      "ocean\n",
      "giving\n",
      "contentment\n",
      "writing\n",
      "rural\n",
      "positive_emotion\n",
      "musical\n",
      "colors\n",
      "text_abbreviations\n"
     ]
    }
   ],
   "source": [
    "#calculate frequency for each empath category\n",
    "for word in wordlist:\n",
    "    print(word)\n",
    "    pctt = []\n",
    "    for i in range(0, pDFt.shape[0]):\n",
    "        content = re.sub(r'[^\\w\\s]', '', str(pDFt.Content[i]).lower())\n",
    "        lexicon = Empath()\n",
    "        emp = lexicon.analyze(content, categories=[word], normalize = True)\n",
    "        if emp != None:\n",
    "            for key, value in emp.items():\n",
    "                pctt.append(value)\n",
    "        else:\n",
    "            pctt.append(0)\n",
    "    pDFt[word] = pctt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'NN', 'PRP', 'CD', 'DT', 'SYM', 'NNS', 'NNP'}\n"
     ]
    }
   ],
   "source": [
    "#get list of Part of Speech (POS) tags\n",
    "posTags = []\n",
    "for i in range(0, pDFt.shape[0]):\n",
    "    for tag in pDFt.POStags[i]:\n",
    "        posTags.append(tag)\n",
    "posSet = set(posTags)\n",
    "print(posSet)\n",
    "\n",
    "#POS tag counting\n",
    "poswordst = []\n",
    "for posList in pDFt.POStags:\n",
    "    poswordst.append(len(posList))\n",
    "#pDFt[\"WordsTags\"] = poswordst\n",
    "\n",
    "for tag in posSet:\n",
    "    cnt = []\n",
    "    for posList in pDFt.POStags:\n",
    "        counter = 0\n",
    "        for item in posList:\n",
    "            if item == tag:\n",
    "                counter += 1\n",
    "        cnt.append(counter)\n",
    "    pDFt[tag] = cnt\n",
    "\n",
    "#turn tag count into frequencies\n",
    "for tag in posSet:\n",
    "    freq = []\n",
    "    for i in range(0, pDFt.shape[0]):\n",
    "        freq.append(pDFt[tag][i]/poswordst[i])\n",
    "    pDFt[tag] = freq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "pDFt.to_csv(\"tPromptTextFeatures.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
