{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "from natsort import natsorted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             Id          Label  \\\n",
      "0      video100    family_kids   \n",
      "1      video102  news_politics   \n",
      "2      video103        cooking   \n",
      "3     video1030         sports   \n",
      "4     video1031  vehicles_auto   \n",
      "...         ...            ...   \n",
      "1840   video994        cooking   \n",
      "1841   video996   animals_pets   \n",
      "1842   video997    family_kids   \n",
      "1843   video302            NaN   \n",
      "1844   video304            NaN   \n",
      "\n",
      "                                               Concepts  \n",
      "0     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
      "1     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
      "2     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
      "3     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
      "4     [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...  \n",
      "...                                                 ...  \n",
      "1840  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
      "1841  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
      "1842  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
      "1843  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
      "1844  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
      "\n",
      "[1845 rows x 3 columns]\n"
     ]
    }
   ],
   "source": [
    "## Getting ground-truth labels and concept vectors for training\n",
    "\n",
    "surveys_dir = './Extract_Concepts/surveys'\n",
    "\n",
    "# Read the first two columns of every survey CSV, visiting files in natural filename order.\n",
    "# skiprows=1 skips each file's first row; the columns are named explicitly instead.\n",
    "surveys = [\n",
    "    pd.read_csv(os.path.join(surveys_dir, f), skiprows=1, names=['Id', 'Label'], usecols=range(2))\n",
    "    for f in natsorted(os.listdir(surveys_dir))\n",
    "    if f.endswith('.csv')\n",
    "]\n",
    "surveys_df = pd.concat(surveys, axis=0).reset_index(drop=True)\n",
    "\n",
    "# Collapse to one row per video Id, keeping the first label found for that Id.\n",
    "aggregation_functions = {'Label': 'first'}\n",
    "survey_labels = surveys_df.groupby('Id').agg(aggregation_functions)\n",
    "\n",
    "# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.\n",
    "load_file = pd.read_pickle(r'20210930_lg_mpcg_ids_and_pam_by_cummulative_mi.pkl')\n",
    "concept_ids = pd.DataFrame({'Id': list(load_file['unique_ids']), 'Concepts': list(load_file['merged_pam'])})\n",
    "\n",
    "# Right join keeps every Id from the pickle; Ids without a survey row get a NaN Label.\n",
    "labels_df = pd.merge(survey_labels, concept_ids, on='Id', how='right')\n",
    "labels_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sports                323\n",
       "music                 275\n",
       "cooking               237\n",
       "family_kids           234\n",
       "vehicles_auto         179\n",
       "news_politics         158\n",
       "science_technology    138\n",
       "beauty_fashion        121\n",
       "animals_pets          117\n",
       "eating_drinking        61\n",
       "Name: Label, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of videos per label (value_counts leaves out rows whose Label is NaN)\n",
    "label_counts = labels_df['Label'].value_counts()\n",
    "label_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sports                323\n",
      "music                 275\n",
      "cooking               237\n",
      "family_kids           234\n",
      "vehicles_auto         179\n",
      "news_politics         158\n",
      "science_technology    138\n",
      "beauty_fashion        121\n",
      "animals_pets          117\n",
      "eating_drinking        61\n",
      "Name: Label, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Keep every row whose Label is not the literal string 'none', then renumber the index.\n",
    "# NOTE(review): rows with a NaN Label also pass this comparison and stay in the frame — confirm that is intended.\n",
    "has_real_label = labels_df['Label'] != 'none'\n",
    "labels_df_filtered = labels_df.loc[has_real_label].reset_index(drop=True)\n",
    "print(labels_df_filtered['Label'].value_counts())\n",
    "\n",
    "# Persist the filtered table for later use\n",
    "labels_df_filtered.to_pickle(\"labels_df_filtered.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Representative concept per final_id, chosen by frequency\n",
    "\n",
    "concepts = pd.read_csv('./Extract_Concepts/20210930_lg_concept_groupings_by_cummulative_mi.csv')\n",
    "# Sort by freq (descending) so that drop_duplicates keeps a highest-frequency row for each final_id.\n",
    "max_concepts = concepts.sort_values('freq', ascending=False).drop_duplicates(['final_id']).sort_values('final_id')\n",
    "# Rows with final_id == -1 are left out of the exported table.\n",
    "max_concepts = max_concepts[max_concepts['final_id'] != -1].reset_index(drop=True)\n",
    "max_concepts.to_csv('concepts_104.csv')\n",
    "\n",
    "\n",
    "## Map video Ids to survey explanations\n",
    "\n",
    "ids = []\n",
    "labels = []\n",
    "explanations = []\n",
    "\n",
    "for f in natsorted(os.listdir(surveys_dir)):\n",
    "    if not f.endswith('.csv'):\n",
    "        continue\n",
    "\n",
    "    # The context manager closes the file handle as soon as the lines are read.\n",
    "    with open(os.path.join(surveys_dir, f), \"r\") as survey:\n",
    "        txt = survey.readlines()\n",
    "\n",
    "    for i, line in enumerate(txt):\n",
    "        if i == 0:\n",
    "            # First line must be the header and start with the 'elapsed' column.\n",
    "            head_tokens = line.split(',')\n",
    "            if head_tokens[0] != 'elapsed':\n",
    "                raise ValueError(f\"Header not valid with:\\n\\t{head_tokens}\")\n",
    "            continue\n",
    "\n",
    "        # Plain comma split: field 0 is the Id, field 1 the label, and everything after\n",
    "        # is re-joined so commas inside the explanation text are preserved.\n",
    "        # NOTE(review): quoted CSV fields are not unquoted by this parsing — confirm that is acceptable.\n",
    "        line_tokens = line.strip().split(',')\n",
    "        if len(line_tokens) < 2:\n",
    "            # No label column on this line: report it and skip the row so the three lists stay aligned.\n",
    "            print(f\"Skipping line {i + 1} of {f}: expected at least Id and Label, got {line!r}\")\n",
    "            continue\n",
    "\n",
    "        ids.append(str(line_tokens[0]))\n",
    "        labels.append(line_tokens[1])\n",
    "        explanations.append(','.join(line_tokens[2:]).strip())\n",
    "\n",
    "df = pd.DataFrame({'Id': ids, 'Label': labels, 'Explanation': explanations})\n",
    "\n",
    "# One row per video Id, keeping the first label and explanation found for that Id.\n",
    "aggregation_functions = {'Label': 'first', 'Explanation': 'first'}\n",
    "survey_exp = df.groupby('Id').agg(aggregation_functions)\n",
    "\n",
    "# Right join keeps exactly the Ids present in concept_ids; only Id and Explanation are exported.\n",
    "exp_df = pd.merge(survey_exp, concept_ids, on='Id', how='right')\n",
    "exp_df = exp_df[['Id', 'Explanation']]\n",
    "\n",
    "exp_df.to_pickle(\"Explanations_104.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>final_id</th>\n",
       "      <th>group_id</th>\n",
       "      <th>raw_id</th>\n",
       "      <th>freq</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1133</th>\n",
       "      <td>-1</td>\n",
       "      <td>673</td>\n",
       "      <td>962</td>\n",
       "      <td>1</td>\n",
       "      <td>some upbeat music plays</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>43</td>\n",
       "      <td>4</td>\n",
       "      <td>they are singing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>1586</td>\n",
       "      <td>2</td>\n",
       "      <td>this video is cooking show</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>233</th>\n",
       "      <td>2</td>\n",
       "      <td>27</td>\n",
       "      <td>1651</td>\n",
       "      <td>1</td>\n",
       "      <td>the video shows a lot of children</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>242</th>\n",
       "      <td>3</td>\n",
       "      <td>142</td>\n",
       "      <td>709</td>\n",
       "      <td>3</td>\n",
       "      <td>two men wrestling</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>546</th>\n",
       "      <td>99</td>\n",
       "      <td>606</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>it displays highlight of a football match</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>549</th>\n",
       "      <td>100</td>\n",
       "      <td>0</td>\n",
       "      <td>1423</td>\n",
       "      <td>1</td>\n",
       "      <td>her playing music</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>551</th>\n",
       "      <td>101</td>\n",
       "      <td>15</td>\n",
       "      <td>1240</td>\n",
       "      <td>1</td>\n",
       "      <td>different players playing sports hitting and t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>552</th>\n",
       "      <td>102</td>\n",
       "      <td>990</td>\n",
       "      <td>773</td>\n",
       "      <td>2</td>\n",
       "      <td>the video shows a show</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>554</th>\n",
       "      <td>103</td>\n",
       "      <td>292</td>\n",
       "      <td>1087</td>\n",
       "      <td>1</td>\n",
       "      <td>wrestling is sport</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>105 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      final_id  group_id  raw_id  freq  \\\n",
       "1133        -1       673     962     1   \n",
       "2            0        16      43     4   \n",
       "204          1         7    1586     2   \n",
       "233          2        27    1651     1   \n",
       "242          3       142     709     3   \n",
       "...        ...       ...     ...   ...   \n",
       "546         99       606      70     1   \n",
       "549        100         0    1423     1   \n",
       "551        101        15    1240     1   \n",
       "552        102       990     773     2   \n",
       "554        103       292    1087     1   \n",
       "\n",
       "                                                   text  \n",
       "1133                            some upbeat music plays  \n",
       "2                                      they are singing  \n",
       "204                          this video is cooking show  \n",
       "233                   the video shows a lot of children  \n",
       "242                                   two men wrestling  \n",
       "...                                                 ...  \n",
       "546           it displays highlight of a football match  \n",
       "549                                   her playing music  \n",
       "551   different players playing sports hitting and t...  \n",
       "552                              the video shows a show  \n",
       "554                                  wrestling is sport  \n",
       "\n",
       "[105 rows x 5 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "concepts = pd.read_csv('./Extract_Concepts/20210930_lg_concept_groupings_by_cummulative_mi.csv')\n",
    "\n",
    "# One row per final_id, taken from the frame sorted by descending freq and shown in final_id order.\n",
    "# The final_id == -1 row is not removed here.\n",
    "concepts_by_freq = concepts.sort_values('freq', ascending=False)\n",
    "max_concepts = concepts_by_freq.drop_duplicates(['final_id']).sort_values('final_id')\n",
    "max_concepts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>final_id</th>\n",
       "      <th>group_id</th>\n",
       "      <th>raw_id</th>\n",
       "      <th>freq</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>351</th>\n",
       "      <td>18</td>\n",
       "      <td>694</td>\n",
       "      <td>387</td>\n",
       "      <td>1</td>\n",
       "      <td>it is</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     final_id  group_id  raw_id  freq   text\n",
       "351        18       694     387     1  it is"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Every concept row assigned to final_id 18\n",
    "concepts.loc[concepts['final_id'].eq(18)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Label</th>\n",
       "      <th>Concepts</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>video100</td>\n",
       "      <td>family_kids</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>video102</td>\n",
       "      <td>news_politics</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>video103</td>\n",
       "      <td>cooking</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>video1030</td>\n",
       "      <td>sports</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>video1031</td>\n",
       "      <td>vehicles_auto</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1840</th>\n",
       "      <td>video994</td>\n",
       "      <td>cooking</td>\n",
       "      <td>[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1841</th>\n",
       "      <td>video996</td>\n",
       "      <td>animals_pets</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1842</th>\n",
       "      <td>video997</td>\n",
       "      <td>family_kids</td>\n",
       "      <td>[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1843</th>\n",
       "      <td>video302</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1844</th>\n",
       "      <td>video304</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1845 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             Id          Label  \\\n",
       "0      video100    family_kids   \n",
       "1      video102  news_politics   \n",
       "2      video103        cooking   \n",
       "3     video1030         sports   \n",
       "4     video1031  vehicles_auto   \n",
       "...         ...            ...   \n",
       "1840   video994        cooking   \n",
       "1841   video996   animals_pets   \n",
       "1842   video997    family_kids   \n",
       "1843   video302            NaN   \n",
       "1844   video304            NaN   \n",
       "\n",
       "                                               Concepts  \n",
       "0     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "1     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "2     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "3     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "4     [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "...                                                 ...  \n",
       "1840  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "1841  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "1842  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "1843  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "1844  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "\n",
       "[1845 rows x 3 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the filtered Id / Label / Concepts table\n",
    "labels_df_filtered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Explanation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>video100</td>\n",
       "      <td>Because it looks like a kids show.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>video102</td>\n",
       "      <td>Because its talking about plastic.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>video103</td>\n",
       "      <td>Because at the end she says happy cooking.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>video1030</td>\n",
       "      <td>it is a football commentary team</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>video1031</td>\n",
       "      <td>it is a car driving video i like very much car...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1840</th>\n",
       "      <td>video991</td>\n",
       "      <td>IT WAS A MUSIC VIDEO OF THE CHILDREN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1841</th>\n",
       "      <td>video993</td>\n",
       "      <td>HE IS EXPLAINING THE MOBILE PHONE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1842</th>\n",
       "      <td>video994</td>\n",
       "      <td>IT WAS THE COOKING VIDEO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1843</th>\n",
       "      <td>video996</td>\n",
       "      <td>IT WAS THE SPIDER</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1844</th>\n",
       "      <td>video997</td>\n",
       "      <td>IT WAS THE KIDS PLAYING THE BIRD</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1845 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             Id                                        Explanation\n",
       "0      video100                 Because it looks like a kids show.\n",
       "1      video102                 Because its talking about plastic.\n",
       "2      video103         Because at the end she says happy cooking.\n",
       "3     video1030                   it is a football commentary team\n",
       "4     video1031  it is a car driving video i like very much car...\n",
       "...         ...                                                ...\n",
       "1840   video991               IT WAS A MUSIC VIDEO OF THE CHILDREN\n",
       "1841   video993                  HE IS EXPLAINING THE MOBILE PHONE\n",
       "1842   video994                           IT WAS THE COOKING VIDEO\n",
       "1843   video996                                  IT WAS THE SPIDER\n",
       "1844   video997                   IT WAS THE KIDS PLAYING THE BIRD\n",
       "\n",
       "[1845 rows x 2 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the Id / Explanation table\n",
    "exp_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "cooking               42\n",
       "sports                35\n",
       "family_kids           31\n",
       "music                 30\n",
       "vehicles_auto         22\n",
       "news_politics         20\n",
       "animals_pets          17\n",
       "science_technology    17\n",
       "beauty_fashion        16\n",
       "eating_drinking       13\n",
       "Name: Label, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label distribution of the rows from position 1600 to the end\n",
    "# NOTE(review): 1600 looks like a train / hold-out boundary — confirm and give it a named constant\n",
    "labels_df_filtered['Label'].iloc[1600:].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the filtered table to disk (overwrites any existing file with this name)\n",
    "labels_df_filtered.to_pickle(path=\"labels_df_filtered.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'keep_ids' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-2-cc8e5ff19346>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlistdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'vids'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m     \u001b[0;32mif\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkeep_ids\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m         \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremove\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'vids'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'keep_ids' is not defined"
     ]
    }
   ],
   "source": [
    "# Delete every file in 'vids' whose name (up to the first '.') is not in keep_ids.\n",
    "# NOTE(review): keep_ids was never defined in this notebook (the cell raised NameError).\n",
    "# It is assumed here to be the Ids in labels_df_filtered — confirm before running, deletion is permanent.\n",
    "keep_ids = set(labels_df_filtered['Id'])\n",
    "if not keep_ids:\n",
    "    # An empty keep-set would wipe the whole directory, so stop instead.\n",
    "    raise ValueError(\"keep_ids is empty; refusing to delete every file in 'vids'\")\n",
    "\n",
    "for f in os.listdir('vids'):\n",
    "    if f.split('.')[0] not in keep_ids:\n",
    "        os.remove(os.path.join('vids', f))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
