{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Obtain the tweet objects (features) from Twitter API"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "- Hydrate the tweet ids (of the detected coordinated and normal accounts) into full tweet JSON objects using the twarc library:\n",
    "```\n",
    "pip3 install twarc\n",
    "pip3 install tqdm\n",
    "twarc configure\n",
    "```\n",
    "- Format the collected json object into csv files containing tweet features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!/usr/bin/env python3\n",
    "\n",
    "# Hydration script: This script will walk through all the tweet id files and\n",
    "# hydrate them with twarc. The line oriented JSON files will\n",
    "# be placed right next to each tweet id file.\n",
    "\n",
    "# Note: you will need to install twarc, tqdm, and run twarc configure\n",
    "# from the command line to tell it your Twitter API keys (obtained by applying for Twitter developer account).\n",
    "\n",
    "import gzip\n",
    "import json\n",
    "\n",
    "from tqdm import tqdm\n",
    "from twarc import Twarc\n",
    "from pathlib import Path\n",
    "\n",
    "# Twarc client created with no arguments; presumably it picks up the API keys\n",
    "# stored by `twarc configure` (see the note above) - confirm for your twarc version.\n",
    "twarc = Twarc()\n",
    "# Directories that main() scans for '.txt' tweet-id files (placeholder, replace it).\n",
    "data_dirs = ['path_to_dir_containing_tweetids_files']\n",
    "\n",
    "\n",
    "def main():\n",
    "    \"\"\"Hydrate every entry whose name ends in '.txt' directly inside each directory of data_dirs.\"\"\"\n",
    "    for data_dir in data_dirs:\n",
    "        id_files = (entry for entry in Path(data_dir).iterdir()\n",
    "                    if entry.name.endswith('.txt'))\n",
    "        for id_file in id_files:\n",
    "            hydrate(id_file)\n",
    "\n",
    "\n",
    "def _reader_generator(reader):\n",
    "    b = reader(1024 * 1024)\n",
    "    while b:\n",
    "        yield b\n",
    "        b = reader(1024 * 1024)\n",
    "\n",
    "\n",
    "def raw_newline_count(fname):\n",
    "    \"\"\"\n",
    "    Counts number of lines in file\n",
    "\n",
    "    The file is read in binary mode, chunk by chunk, and the newline bytes are\n",
    "    summed, so a last line without a trailing newline is not counted.\n",
    "\n",
    "    Args:\n",
    "        fname: path of the file to scan (anything accepted by open()).\n",
    "\n",
    "    Returns:\n",
    "        Number of newline bytes found in the file.\n",
    "    \"\"\"\n",
    "    # The with-block closes the handle once counting is done (it used to be left open).\n",
    "    with open(fname, 'rb') as f:\n",
    "        f_gen = _reader_generator(f.raw.read)\n",
    "        return sum(buf.count(b'\\n') for buf in f_gen)\n",
    "\n",
    "\n",
    "def hydrate(id_file):\n",
    "    \"\"\"\n",
    "    Hydrate the tweet ids listed in id_file into a gzipped, line-oriented JSON file.\n",
    "\n",
    "    The output is written next to id_file, with the suffix replaced by '.jsonl.gz'.\n",
    "    If that file already exists the id file is skipped.\n",
    "\n",
    "    Args:\n",
    "        id_file: pathlib.Path of a text file with one tweet id per line.\n",
    "    \"\"\"\n",
    "    print('hydrating {}'.format(id_file))\n",
    "\n",
    "    gzip_path = id_file.with_suffix('.jsonl.gz')\n",
    "    if gzip_path.is_file():\n",
    "        print('skipping json file already exists: {}'.format(gzip_path))\n",
    "        return\n",
    "\n",
    "    # Used only as the progress-bar total.\n",
    "    num_ids = raw_newline_count(id_file)\n",
    "\n",
    "    # Write to a temporary sibling file and rename it only after a complete run;\n",
    "    # otherwise an interrupted hydration would leave a truncated '.jsonl.gz' that\n",
    "    # the existence check above treats as finished.\n",
    "    partial_path = gzip_path.with_name(gzip_path.name + '.part')\n",
    "    # Both handles are closed by the with-block (the id file used to be left open).\n",
    "    with id_file.open() as ids, gzip.open(partial_path, 'w') as output:\n",
    "        with tqdm(total=num_ids) as pbar:\n",
    "            for tweet in twarc.hydrate(ids):\n",
    "                output.write(json.dumps(tweet).encode('utf8') + b\"\\n\")\n",
    "                pbar.update(1)\n",
    "    partial_path.replace(gzip_path)\n",
    "\n",
    "\n",
    "# Start the hydration only when this code is the entry point (__name__ is '__main__').\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tweet features analysis (from formatted tweet csvs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Placeholder: directory holding the tweet-feature csv files that are loaded further down.\n",
    "ddir = 'path_to_dir_containing_tweet_features_csv_from_hydrated_tweet_objects'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Description of required fields in tweet features csvs\n",
    "\"\"\"\n",
    "needed_columns = ['tweetid', 'corrected_tweet_type', 'hashtags', 'text']  # For text analysis\n",
    "# corrected_tweet_type = retweet/original/reply/quoted_tweet\n",
    "# (https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet)\n",
    "# hashtag = in the csv file provide a string formatted list of hashtags present in tweet object\n",
    "\n",
    "# (optional) additional_columns = ['userid', 'account_creation_date', 'urls', 'suspension_status']  \n",
    "# additional tweet information for disinformation urls, account creation dates, account suspensions \n",
    "# can be crawled separately from Twitter APIs.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.path.join inserts the path separator, so ddir works with or without a trailing slash\n",
    "# (plain string concatenation silently built a wrong path when the slash was missing).\n",
    "\n",
    "# tweet features of detected coordinated accounts\n",
    "adf_tweets = pd.read_csv(os.path.join(ddir, 'adf_tweets.csv'), dtype=str, lineterminator='\\n')\n",
    "\n",
    "# tweet features of detected normal accounts (all accounts - detected coordinated set)\n",
    "ndf_tweets = pd.read_csv(os.path.join(ddir, 'ndf_tweets.csv'), dtype=str, lineterminator='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tweetid</th>\n",
       "      <th>corrected_tweet_type</th>\n",
       "      <th>hashtag</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1346975187011567616</td>\n",
       "      <td>retweet</td>\n",
       "      <td>[]</td>\n",
       "      <td>RT @MattHancock: Tonight, MPs approved the COV...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1346975209836994562</td>\n",
       "      <td>reply</td>\n",
       "      <td>[]</td>\n",
       "      <td>@BogochIsaac Great news..hoping J and J vaccin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1346975218426900480</td>\n",
       "      <td>retweet</td>\n",
       "      <td>[]</td>\n",
       "      <td>RT @ElizabethSecon3: Obstetrician dies due to ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1346975224697221120</td>\n",
       "      <td>retweet</td>\n",
       "      <td>[]</td>\n",
       "      <td>RT @KevinStewartSNP: This kind of nonsense wit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1346975238295322626</td>\n",
       "      <td>retweet</td>\n",
       "      <td>[]</td>\n",
       "      <td>RT @msm_monitor: We've been monitoring BBC Sco...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>629958</th>\n",
       "      <td>1363999599673901057</td>\n",
       "      <td>retweet</td>\n",
       "      <td>[]</td>\n",
       "      <td>RT @thatsmanderley: This is troubling. https:/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>629959</th>\n",
       "      <td>1363999650534031361</td>\n",
       "      <td>reply</td>\n",
       "      <td>[]</td>\n",
       "      <td>@PhilCochetti @covid_clarity Of course they ar...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>629960</th>\n",
       "      <td>1363999664186544131</td>\n",
       "      <td>original</td>\n",
       "      <td>[]</td>\n",
       "      <td>A very good read.  @BorisJohnson @MattHancock ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>629961</th>\n",
       "      <td>1363999671081967621</td>\n",
       "      <td>retweet</td>\n",
       "      <td>['LTC', 'COVID19Ontario']</td>\n",
       "      <td>RT @ruthmkb: 11 more Ontarians DIED! 4 of them...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>629962</th>\n",
       "      <td>1363999674550616065</td>\n",
       "      <td>retweet</td>\n",
       "      <td>[]</td>\n",
       "      <td>RT @SharpieDj: @themichaelcaine Anyone who tak...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>629963 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    tweetid corrected_tweet_type                    hashtag  \\\n",
       "0       1346975187011567616              retweet                         []   \n",
       "1       1346975209836994562                reply                         []   \n",
       "2       1346975218426900480              retweet                         []   \n",
       "3       1346975224697221120              retweet                         []   \n",
       "4       1346975238295322626              retweet                         []   \n",
       "...                     ...                  ...                        ...   \n",
       "629958  1363999599673901057              retweet                         []   \n",
       "629959  1363999650534031361                reply                         []   \n",
       "629960  1363999664186544131             original                         []   \n",
       "629961  1363999671081967621              retweet  ['LTC', 'COVID19Ontario']   \n",
       "629962  1363999674550616065              retweet                         []   \n",
       "\n",
       "                                                     text  \n",
       "0       RT @MattHancock: Tonight, MPs approved the COV...  \n",
       "1       @BogochIsaac Great news..hoping J and J vaccin...  \n",
       "2       RT @ElizabethSecon3: Obstetrician dies due to ...  \n",
       "3       RT @KevinStewartSNP: This kind of nonsense wit...  \n",
       "4       RT @msm_monitor: We've been monitoring BBC Sco...  \n",
       "...                                                   ...  \n",
       "629958  RT @thatsmanderley: This is troubling. https:/...  \n",
       "629959  @PhilCochetti @covid_clarity Of course they ar...  \n",
       "629960  A very good read.  @BorisJohnson @MattHancock ...  \n",
       "629961  RT @ruthmkb: 11 more Ontarians DIED! 4 of them...  \n",
       "629962  RT @SharpieDj: @themichaelcaine Anyone who tak...  \n",
       "\n",
       "[629963 rows x 4 columns]"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the four text-analysis columns of the coordinated-accounts tweets.\n",
    "adf_tweets[['tweetid', 'corrected_tweet_type', 'hashtag', 'text']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "retweet         509168\n",
       "reply            52345\n",
       "quoted_tweet     44331\n",
       "original         24119\n",
       "Name: corrected_tweet_type, dtype: int64"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of coordinated-account tweets per tweet type.\n",
    "adf_tweets['corrected_tweet_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def get_hashtag_dist_counts(adf_tweets, ndf_tweets):\n",
    "    hashtag_dists_counts = []\n",
    "    for ii, (grp_tweets, grp_name) in enumerate(zip([adf_tweets, ndf_tweets], ['coordinated', 'normal'])):\n",
    "        print('Group:', ii, grp_name)\n",
    "        # g[g['corrected_tweet_type']!='retweeted_tweet_without_comment'] # exclude retweets.\n",
    "        grp_tweets = grp_tweets[grp_tweets['corrected_tweet_type'] != 'retweet']\n",
    "        print(len(grp_tweets))\n",
    "        grp_hashtags = grp_tweets['hashtag'].dropna().apply(lambda x: json.loads(x.lower().replace(\"'\", '\"')))\n",
    "        print(len(grp_hashtags))\n",
    "        grp_flat_hashtags = []\n",
    "        for i, hash_list in grp_hashtags.iteritems():\n",
    "            if len(hash_list) > 0: \n",
    "                grp_flat_hashtags += hash_list\n",
    "        grp_flat_hashtags = pd.Series(grp_flat_hashtags)\n",
    "        # print(g_flat_hashtags.value_counts()[0:10])\n",
    "        hashtag_dists_counts.append(grp_flat_hashtags.value_counts())\n",
    "    return hashtag_dists_counts\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Group: 0 coordinated\n",
      "120795\n",
      "120795\n",
      "Group: 1 normal\n",
      "250750\n",
      "250750\n"
     ]
    }
   ],
   "source": [
    "hashtag_dists_counts = get_hashtag_dist_counts(adf_tweets, ndf_tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_unique_hashtags_in_the_groups(hashtag_dists_counts, topk_common=100, max_h=500):\n",
    "    # topk_common is the number of hashtags to consider from each list \n",
    "    # max_h is the num of hashatgs in each group to consider \n",
    "    # for filtering the topk_common and returning unique ones for the grp\n",
    "\n",
    "    topk = [0]*len(hashtag_dists_counts)\n",
    "    for ii in range(len(hashtag_dists_counts)): \n",
    "        topk[ii] = set(hashtag_dists_counts[ii][0:topk_common].index)\n",
    "        # First 100 hashtags in each group: print(topk[ii])\n",
    "\n",
    "    unique = [0]*len(hashtag_dists_counts)\n",
    "    for ii in range(len(hashtag_dists_counts)):\n",
    "        # print('=======ii============', ii)\n",
    "        unique[ii] = []\n",
    "        for h in hashtag_dists_counts[ii][0:max_h].items():\n",
    "            flag = 'add'\n",
    "            for jj in range(len(hashtag_dists_counts)):\n",
    "                if jj == ii: continue\n",
    "                if h[0] in topk[jj]: flag='skip'\n",
    "            if flag == 'add':\n",
    "                unique[ii].append(h)\n",
    "        # for i, j in enumerate(unique[ii][0:100]):\n",
    "        #     print(j)\n",
    "    return unique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Top 500 hashtags in each group; topk_common=0 excludes nothing, so hashtags common to both groups are retained.\n",
    "unique = get_unique_hashtags_in_the_groups(hashtag_dists_counts, topk_common=0, max_h=500)\n",
    "# Each entry of `unique` is a list of (hashtag, count) tuples: the first column holds the\n",
    "# hashtag, the second its count. Naming the columns in the constructor (instead of\n",
    "# assigning .columns afterwards) also works when a group produced no hashtags at all.\n",
    "df1 = pd.DataFrame(unique[0], columns=['Coordinated group', 'Vol. Hashtag']).head(36)\n",
    "df2 = pd.DataFrame(unique[1], columns=['Normal group', 'Vol. Hashtag']).head(36)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Coordinated group</th>\n",
       "      <th>Vol. Hashtag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>covid19</td>\n",
       "      <td>3096</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>vaccine</td>\n",
       "      <td>2563</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>cdnpoli</td>\n",
       "      <td>2443</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>covidvaccine</td>\n",
       "      <td>1219</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>auspol</td>\n",
       "      <td>888</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>onpoli</td>\n",
       "      <td>766</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>coronavirus</td>\n",
       "      <td>717</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>covid</td>\n",
       "      <td>713</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>vaccines</td>\n",
       "      <td>541</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>trudeauvaccinefail</td>\n",
       "      <td>363</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>pfizer</td>\n",
       "      <td>340</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>lockdown</td>\n",
       "      <td>309</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>trudeaufailedcanada</td>\n",
       "      <td>305</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>vaccination</td>\n",
       "      <td>279</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>bcpoli</td>\n",
       "      <td>254</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>trudeaumustgo</td>\n",
       "      <td>251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>astrazeneca</td>\n",
       "      <td>246</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>ableg</td>\n",
       "      <td>231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>covid19vaccine</td>\n",
       "      <td>227</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>canada</td>\n",
       "      <td>222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>abpoli</td>\n",
       "      <td>216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>cdnmedia</td>\n",
       "      <td>214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>scamdemic2020</td>\n",
       "      <td>213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>trudeauvaccinefailure</td>\n",
       "      <td>198</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>masksdontwork</td>\n",
       "      <td>195</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>uk</td>\n",
       "      <td>193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>notocoronavirusvaccines</td>\n",
       "      <td>190</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>plandemic2020</td>\n",
       "      <td>184</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>livingnotlockdown</td>\n",
       "      <td>183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>lockdownchaos</td>\n",
       "      <td>183</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Coordinated group  Vol. Hashtag\n",
       "0                   covid19          3096\n",
       "1                   vaccine          2563\n",
       "2                   cdnpoli          2443\n",
       "3              covidvaccine          1219\n",
       "4                    auspol           888\n",
       "5                    onpoli           766\n",
       "6               coronavirus           717\n",
       "7                     covid           713\n",
       "8                  vaccines           541\n",
       "9        trudeauvaccinefail           363\n",
       "10                   pfizer           340\n",
       "11                 lockdown           309\n",
       "12      trudeaufailedcanada           305\n",
       "13              vaccination           279\n",
       "14                   bcpoli           254\n",
       "15            trudeaumustgo           251\n",
       "16              astrazeneca           246\n",
       "17                    ableg           231\n",
       "18           covid19vaccine           227\n",
       "19                   canada           222\n",
       "20                   abpoli           216\n",
       "21                 cdnmedia           214\n",
       "22            scamdemic2020           213\n",
       "23    trudeauvaccinefailure           198\n",
       "24            masksdontwork           195\n",
       "25                       uk           193\n",
       "26  notocoronavirusvaccines           190\n",
       "27            plandemic2020           184\n",
       "28        livingnotlockdown           183\n",
       "29            lockdownchaos           183"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First 30 rows of the coordinated-group hashtag table.\n",
    "df1.head(30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Normal group</th>\n",
       "      <th>Vol. Hashtag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>covid19</td>\n",
       "      <td>11319</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>vaccine</td>\n",
       "      <td>7585</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>coronavirus</td>\n",
       "      <td>3674</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>covidvaccine</td>\n",
       "      <td>3431</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>covid</td>\n",
       "      <td>2230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>staysafe</td>\n",
       "      <td>1873</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>smallbusiness</td>\n",
       "      <td>1857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>maskup</td>\n",
       "      <td>1853</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>washyourhands</td>\n",
       "      <td>1738</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>redbubble</td>\n",
       "      <td>1678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>covid_19</td>\n",
       "      <td>1637</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>vaccines</td>\n",
       "      <td>1377</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>pfizer</td>\n",
       "      <td>1134</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>healthcare</td>\n",
       "      <td>1098</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>machinelearning</td>\n",
       "      <td>879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>artificialintelligence</td>\n",
       "      <td>869</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>vaccination</td>\n",
       "      <td>829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>astrazeneca</td>\n",
       "      <td>750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>largestvaccinedrive</td>\n",
       "      <td>612</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>covid19vaccine</td>\n",
       "      <td>572</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>wearamask</td>\n",
       "      <td>514</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>moderna</td>\n",
       "      <td>508</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>health</td>\n",
       "      <td>490</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>christmas</td>\n",
       "      <td>482</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>newyear</td>\n",
       "      <td>451</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>indiafightscorona</td>\n",
       "      <td>436</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>operationbreathefreshcleanair</td>\n",
       "      <td>427</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>uk</td>\n",
       "      <td>427</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>trump</td>\n",
       "      <td>414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>cdnpoli</td>\n",
       "      <td>389</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     Normal group  Vol. Hashtag\n",
       "0                         covid19         11319\n",
       "1                         vaccine          7585\n",
       "2                     coronavirus          3674\n",
       "3                    covidvaccine          3431\n",
       "4                           covid          2230\n",
       "5                        staysafe          1873\n",
       "6                   smallbusiness          1857\n",
       "7                          maskup          1853\n",
       "8                   washyourhands          1738\n",
       "9                       redbubble          1678\n",
       "10                       covid_19          1637\n",
       "11                       vaccines          1377\n",
       "12                         pfizer          1134\n",
       "13                     healthcare          1098\n",
       "14                machinelearning           879\n",
       "15         artificialintelligence           869\n",
       "16                    vaccination           829\n",
       "17                    astrazeneca           750\n",
       "18            largestvaccinedrive           612\n",
       "19                 covid19vaccine           572\n",
       "20                      wearamask           514\n",
       "21                        moderna           508\n",
       "22                         health           490\n",
       "23                      christmas           482\n",
       "24                        newyear           451\n",
       "25              indiafightscorona           436\n",
       "26  operationbreathefreshcleanair           427\n",
       "27                             uk           427\n",
       "28                          trump           414\n",
       "29                        cdnpoli           389"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First 30 rows of the normal-group hashtag table.\n",
    "df2.head(30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'astrazeneca',\n",
       " 'auspol',\n",
       " 'cdnpoli',\n",
       " 'coronavirus',\n",
       " 'covid',\n",
       " 'covid19',\n",
       " 'covid19vaccine',\n",
       " 'covid_19',\n",
       " 'covidvaccine',\n",
       " 'pfizer',\n",
       " 'uk',\n",
       " 'vaccination',\n",
       " 'vaccine',\n",
       " 'vaccines'}"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Hashtags present in both the coordinated-group and the normal-group table.\n",
    "common = set(df1['Coordinated group']).intersection(df2['Normal group'])\n",
    "common"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
