{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "eeb86600-dd40-4bfe-87c6-5ec23e96f162",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-08-19 01:17:25.854364: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import torch\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import gensim\n",
    "import ast\n",
    "import umap\n",
    "from gensim.models.doc2vec import TaggedDocument\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.metrics import adjusted_rand_score\n",
    "from sklearn.manifold import TSNE\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import csv\n",
    "from scipy.spatial import distance\n",
    "from scipy.spatial.distance import cityblock\n",
    "from torch import nn\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "from torchvision import transforms, utils\n",
    "from nltk.stem.porter import *\n",
    "import torch.nn.functional as F\n",
    "import pickle\n",
    "import random\n",
    "import math\n",
    "from tqdm import tqdm\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a2daba22-0825-4c69-9a69-cbfcd0a2bcb3",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0.5</th>\n",
       "      <th>Unnamed: 0.4</th>\n",
       "      <th>Unnamed: 0.3</th>\n",
       "      <th>Unnamed: 0.2</th>\n",
       "      <th>Unnamed: 0.1</th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Recipe_id</th>\n",
       "      <th>Region</th>\n",
       "      <th>Sub_region</th>\n",
       "      <th>Continent</th>\n",
       "      <th>ingredients</th>\n",
       "      <th>Instructions</th>\n",
       "      <th>Ingredient_List</th>\n",
       "      <th>Ing List Sent</th>\n",
       "      <th>Ing Inst</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3697</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['dungeness crab', 'turmeric', 'salt', 'mustar...</td>\n",
       "      <td>rub the crabs with  teaspoon of the turmeric a...</td>\n",
       "      <td>['Indian_Subcontinent', 'dungeness_crab', 'tur...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3698</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['tomato', 'cumin', 'turmeric', 'salt', 'water...</td>\n",
       "      <td>bring the tomatoes  cumin  turmeric  salt  and...</td>\n",
       "      <td>['Indian_Subcontinent', 'tomato', 'cumin', 'tu...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>3699</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['vegetable oil', 'onion', 'tomato', 'green ch...</td>\n",
       "      <td>heat  tablespoon oil in a skillet over medium ...</td>\n",
       "      <td>['Indian_Subcontinent', 'vegetable_oil', 'onio...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3700</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['lamb shoulder', 'garam masala', 'salt', 'but...</td>\n",
       "      <td>season the lamb with garam_masala and salt   h...</td>\n",
       "      <td>['Indian_Subcontinent', 'lamb_shoulder', 'gara...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>3701</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['potato', 'pea', 'vegetable oil', 'cumin seed...</td>\n",
       "      <td>bring a medium saucepan of lightly salted wate...</td>\n",
       "      <td>['Indian_Subcontinent', 'potato', 'pea', 'vege...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0.5  Unnamed: 0.4  Unnamed: 0.3  Unnamed: 0.2  Unnamed: 0.1  \\\n",
       "0             0             0             0             0             0   \n",
       "1             1             1             1             1             1   \n",
       "2             2             2             2             2             2   \n",
       "3             3             3             3             3             3   \n",
       "4             4             4             4             4             4   \n",
       "\n",
       "   Unnamed: 0  Recipe_id               Region   Sub_region Continent  \\\n",
       "0           0       3697  Indian Subcontinent  Bangladeshi     Asian   \n",
       "1           1       3698  Indian Subcontinent  Bangladeshi     Asian   \n",
       "2           2       3699  Indian Subcontinent  Bangladeshi     Asian   \n",
       "3           3       3700  Indian Subcontinent  Bangladeshi     Asian   \n",
       "4           4       3701  Indian Subcontinent  Bangladeshi     Asian   \n",
       "\n",
       "                                         ingredients  \\\n",
       "0  ['dungeness crab', 'turmeric', 'salt', 'mustar...   \n",
       "1  ['tomato', 'cumin', 'turmeric', 'salt', 'water...   \n",
       "2  ['vegetable oil', 'onion', 'tomato', 'green ch...   \n",
       "3  ['lamb shoulder', 'garam masala', 'salt', 'but...   \n",
       "4  ['potato', 'pea', 'vegetable oil', 'cumin seed...   \n",
       "\n",
       "                                        Instructions  \\\n",
       "0  rub the crabs with  teaspoon of the turmeric a...   \n",
       "1  bring the tomatoes  cumin  turmeric  salt  and...   \n",
       "2  heat  tablespoon oil in a skillet over medium ...   \n",
       "3  season the lamb with garam_masala and salt   h...   \n",
       "4  bring a medium saucepan of lightly salted wate...   \n",
       "\n",
       "                                     Ingredient_List  \\\n",
       "0  ['Indian_Subcontinent', 'dungeness_crab', 'tur...   \n",
       "1  ['Indian_Subcontinent', 'tomato', 'cumin', 'tu...   \n",
       "2  ['Indian_Subcontinent', 'vegetable_oil', 'onio...   \n",
       "3  ['Indian_Subcontinent', 'lamb_shoulder', 'gara...   \n",
       "4  ['Indian_Subcontinent', 'potato', 'pea', 'vege...   \n",
       "\n",
       "                                       Ing List Sent  \\\n",
       "0  This recipe from Indian_Subcontinent cuisine c...   \n",
       "1  This recipe from Indian_Subcontinent cuisine c...   \n",
       "2  This recipe from Indian_Subcontinent cuisine c...   \n",
       "3  This recipe from Indian_Subcontinent cuisine c...   \n",
       "4  This recipe from Indian_Subcontinent cuisine c...   \n",
       "\n",
       "                                            Ing Inst  \n",
       "0  This recipe from Indian_Subcontinent cuisine c...  \n",
       "1  This recipe from Indian_Subcontinent cuisine c...  \n",
       "2  This recipe from Indian_Subcontinent cuisine c...  \n",
       "3  This recipe from Indian_Subcontinent cuisine c...  \n",
       "4  This recipe from Indian_Subcontinent cuisine c...  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the recipe table from CSV and preview its first rows.\n",
    "recipe_csv = \"Recipe DB Modified for BERT.csv\"\n",
    "df = pd.read_csv(recipe_csv)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3d0a34dd-7f24-4ba8-8307-1f56f10329d8",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Unnamed: 0.5', 'Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2',\n",
       "       'Unnamed: 0.1', 'Unnamed: 0', 'Recipe_id', 'Region', 'Sub_region',\n",
       "       'Continent', 'ingredients', 'Instructions', 'Ingredient_List',\n",
       "       'Ing List Sent', 'Ing Inst'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the column labels of the loaded frame.\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ce293320-ad43-4eef-b001-d8be933bd85b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Remove short words: keep only the tokens longer than three characters.\n",
    "\n",
    "def _drop_short_words(text):\n",
    "    \"\"\"Return `text` rebuilt from its whitespace-separated words of length > 3.\"\"\"\n",
    "    kept_words = (word for word in text.split() if len(word) > 3)\n",
    "    return ' '.join(kept_words)\n",
    "\n",
    "df['tidy_ing_inst'] = df['Ing Inst'].apply(_drop_short_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "4f4e3bca-3956-467c-903c-ad0a260eb2cc",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'This recipe from Indian_Subcontinent cuisine contains dungeness_crab turmeric salt mustard_seed water mustard_oil red_onion potato_lengthwise clove cinnamon green_cardamom_pod black_peppercorn tomato thai_green_chile garlic_paste ginger_paste cayenne_pepper white_sugar garnish wedge_lemon cilantro ingredients crabs with teaspoon turmeric teaspoon salt them marinate hour combine mustard_seed hot_water small bowl stand minutes mortar pestle grind seeds into coarse paste heat kadhai over medium heat crabs stir until they change color about minutes remove crabs from aside sliced onions cook stir over medium heat until onions translucent about minutes raise heat high potatoes cook stirring constantly about minutes cloves cinnamon stick cardamom_pods peppercorns stir thirty seconds stir tomatoes ginger_paste garlic_paste halve three chiles them cook stir additional minute over high heat reduce heat medium remaining teaspoon turmeric cayenne_pepper mustard_paste stir combine crabs pour just enough water cover vegetables bring water boil stir sugar salt taste cover reduce heat simmer until potatoes tender water reduced half about minutes remove stir simmer until gravy thickened about minutes more squeeze lemon_wedge over finished dish garnish with chopped cilantro sliced green_chile serve with rice'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the cleaned text of the row labelled 0 as a sanity check.\n",
    "df.loc[0, 'tidy_ing_inst']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "36e434b1-7909-44aa-a9d8-b6a68cb444e2",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    [This, recipe, from, Indian_Subcontinent, cuis...\n",
       "1    [This, recipe, from, Indian_Subcontinent, cuis...\n",
       "2    [This, recipe, from, Indian_Subcontinent, cuis...\n",
       "3    [This, recipe, from, Indian_Subcontinent, cuis...\n",
       "4    [This, recipe, from, Indian_Subcontinent, cuis...\n",
       "Name: tidy_ing_inst, dtype: object"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tokenization: split every cleaned text on whitespace into a list of words.\n",
    "\n",
    "tokenized_txt = df['tidy_ing_inst'].map(str.split)\n",
    "tokenized_txt.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "f8b22a70-8d07-4ef7-8df5-6fae022ed9cf",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0.5</th>\n",
       "      <th>Unnamed: 0.4</th>\n",
       "      <th>Unnamed: 0.3</th>\n",
       "      <th>Unnamed: 0.2</th>\n",
       "      <th>Unnamed: 0.1</th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Recipe_id</th>\n",
       "      <th>Region</th>\n",
       "      <th>Sub_region</th>\n",
       "      <th>Continent</th>\n",
       "      <th>ingredients</th>\n",
       "      <th>Instructions</th>\n",
       "      <th>Ingredient_List</th>\n",
       "      <th>Ing List Sent</th>\n",
       "      <th>Ing Inst</th>\n",
       "      <th>tidy_ing_inst</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3697</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['dungeness crab', 'turmeric', 'salt', 'mustar...</td>\n",
       "      <td>rub the crabs with  teaspoon of the turmeric a...</td>\n",
       "      <td>['Indian_Subcontinent', 'dungeness_crab', 'tur...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3698</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['tomato', 'cumin', 'turmeric', 'salt', 'water...</td>\n",
       "      <td>bring the tomatoes  cumin  turmeric  salt  and...</td>\n",
       "      <td>['Indian_Subcontinent', 'tomato', 'cumin', 'tu...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>3699</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['vegetable oil', 'onion', 'tomato', 'green ch...</td>\n",
       "      <td>heat  tablespoon oil in a skillet over medium ...</td>\n",
       "      <td>['Indian_Subcontinent', 'vegetable_oil', 'onio...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3700</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['lamb shoulder', 'garam masala', 'salt', 'but...</td>\n",
       "      <td>season the lamb with garam_masala and salt   h...</td>\n",
       "      <td>['Indian_Subcontinent', 'lamb_shoulder', 'gara...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>3701</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['potato', 'pea', 'vegetable oil', 'cumin seed...</td>\n",
       "      <td>bring a medium saucepan of lightly salted wate...</td>\n",
       "      <td>['Indian_Subcontinent', 'potato', 'pea', 'vege...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0.5  Unnamed: 0.4  Unnamed: 0.3  Unnamed: 0.2  Unnamed: 0.1  \\\n",
       "0             0             0             0             0             0   \n",
       "1             1             1             1             1             1   \n",
       "2             2             2             2             2             2   \n",
       "3             3             3             3             3             3   \n",
       "4             4             4             4             4             4   \n",
       "\n",
       "   Unnamed: 0  Recipe_id               Region   Sub_region Continent  \\\n",
       "0           0       3697  Indian Subcontinent  Bangladeshi     Asian   \n",
       "1           1       3698  Indian Subcontinent  Bangladeshi     Asian   \n",
       "2           2       3699  Indian Subcontinent  Bangladeshi     Asian   \n",
       "3           3       3700  Indian Subcontinent  Bangladeshi     Asian   \n",
       "4           4       3701  Indian Subcontinent  Bangladeshi     Asian   \n",
       "\n",
       "                                         ingredients  \\\n",
       "0  ['dungeness crab', 'turmeric', 'salt', 'mustar...   \n",
       "1  ['tomato', 'cumin', 'turmeric', 'salt', 'water...   \n",
       "2  ['vegetable oil', 'onion', 'tomato', 'green ch...   \n",
       "3  ['lamb shoulder', 'garam masala', 'salt', 'but...   \n",
       "4  ['potato', 'pea', 'vegetable oil', 'cumin seed...   \n",
       "\n",
       "                                        Instructions  \\\n",
       "0  rub the crabs with  teaspoon of the turmeric a...   \n",
       "1  bring the tomatoes  cumin  turmeric  salt  and...   \n",
       "2  heat  tablespoon oil in a skillet over medium ...   \n",
       "3  season the lamb with garam_masala and salt   h...   \n",
       "4  bring a medium saucepan of lightly salted wate...   \n",
       "\n",
       "                                     Ingredient_List  \\\n",
       "0  ['Indian_Subcontinent', 'dungeness_crab', 'tur...   \n",
       "1  ['Indian_Subcontinent', 'tomato', 'cumin', 'tu...   \n",
       "2  ['Indian_Subcontinent', 'vegetable_oil', 'onio...   \n",
       "3  ['Indian_Subcontinent', 'lamb_shoulder', 'gara...   \n",
       "4  ['Indian_Subcontinent', 'potato', 'pea', 'vege...   \n",
       "\n",
       "                                       Ing List Sent  \\\n",
       "0  This recipe from Indian_Subcontinent cuisine c...   \n",
       "1  This recipe from Indian_Subcontinent cuisine c...   \n",
       "2  This recipe from Indian_Subcontinent cuisine c...   \n",
       "3  This recipe from Indian_Subcontinent cuisine c...   \n",
       "4  This recipe from Indian_Subcontinent cuisine c...   \n",
       "\n",
       "                                            Ing Inst  \\\n",
       "0  This recipe from Indian_Subcontinent cuisine c...   \n",
       "1  This recipe from Indian_Subcontinent cuisine c...   \n",
       "2  This recipe from Indian_Subcontinent cuisine c...   \n",
       "3  This recipe from Indian_Subcontinent cuisine c...   \n",
       "4  This recipe from Indian_Subcontinent cuisine c...   \n",
       "\n",
       "                                       tidy_ing_inst  \n",
       "0  This recipe from Indian_Subcontinent cuisine c...  \n",
       "1  This recipe from Indian_Subcontinent cuisine c...  \n",
       "2  This recipe from Indian_Subcontinent cuisine c...  \n",
       "3  This recipe from Indian_Subcontinent cuisine c...  \n",
       "4  This recipe from Indian_Subcontinent cuisine c...  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Merge the tokens back into one space-separated string per recipe.\n",
    "#\n",
    "# The strings are rebuilt from the column itself in a single vectorised pass\n",
    "# instead of overwriting `tokenized_txt` element by element. The previous loop\n",
    "# was not safe to re-run: on a second execution it called ' '.join on an\n",
    "# already merged string, which puts a space between every character. It also\n",
    "# assumed that the index labels are exactly 0..n-1.\n",
    "\n",
    "tokenized_txt = df['tidy_ing_inst'].map(lambda text: ' '.join(text.split()))\n",
    "df['tidy_ing_inst'] = tokenized_txt\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "e0d39855-f1b3-4654-9b1d-bd5b3aa2e31c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0.5</th>\n",
       "      <th>Unnamed: 0.4</th>\n",
       "      <th>Unnamed: 0.3</th>\n",
       "      <th>Unnamed: 0.2</th>\n",
       "      <th>Unnamed: 0.1</th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Recipe_id</th>\n",
       "      <th>Region</th>\n",
       "      <th>Sub_region</th>\n",
       "      <th>Continent</th>\n",
       "      <th>ingredients</th>\n",
       "      <th>Instructions</th>\n",
       "      <th>Ingredient_List</th>\n",
       "      <th>Ing List Sent</th>\n",
       "      <th>Ing Inst</th>\n",
       "      <th>tidy_ing_inst</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3697</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['dungeness crab', 'turmeric', 'salt', 'mustar...</td>\n",
       "      <td>rub the crabs with  teaspoon of the turmeric a...</td>\n",
       "      <td>['Indian_Subcontinent', 'dungeness_crab', 'tur...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3698</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['tomato', 'cumin', 'turmeric', 'salt', 'water...</td>\n",
       "      <td>bring the tomatoes  cumin  turmeric  salt  and...</td>\n",
       "      <td>['Indian_Subcontinent', 'tomato', 'cumin', 'tu...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>3699</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['vegetable oil', 'onion', 'tomato', 'green ch...</td>\n",
       "      <td>heat  tablespoon oil in a skillet over medium ...</td>\n",
       "      <td>['Indian_Subcontinent', 'vegetable_oil', 'onio...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3700</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['lamb shoulder', 'garam masala', 'salt', 'but...</td>\n",
       "      <td>season the lamb with garam_masala and salt   h...</td>\n",
       "      <td>['Indian_Subcontinent', 'lamb_shoulder', 'gara...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>3701</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['potato', 'pea', 'vegetable oil', 'cumin seed...</td>\n",
       "      <td>bring a medium saucepan of lightly salted wate...</td>\n",
       "      <td>['Indian_Subcontinent', 'potato', 'pea', 'vege...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0.5  Unnamed: 0.4  Unnamed: 0.3  Unnamed: 0.2  Unnamed: 0.1  \\\n",
       "0             0             0             0             0             0   \n",
       "1             1             1             1             1             1   \n",
       "2             2             2             2             2             2   \n",
       "3             3             3             3             3             3   \n",
       "4             4             4             4             4             4   \n",
       "\n",
       "   Unnamed: 0  Recipe_id               Region   Sub_region Continent  \\\n",
       "0           0       3697  Indian Subcontinent  Bangladeshi     Asian   \n",
       "1           1       3698  Indian Subcontinent  Bangladeshi     Asian   \n",
       "2           2       3699  Indian Subcontinent  Bangladeshi     Asian   \n",
       "3           3       3700  Indian Subcontinent  Bangladeshi     Asian   \n",
       "4           4       3701  Indian Subcontinent  Bangladeshi     Asian   \n",
       "\n",
       "                                         ingredients  \\\n",
       "0  ['dungeness crab', 'turmeric', 'salt', 'mustar...   \n",
       "1  ['tomato', 'cumin', 'turmeric', 'salt', 'water...   \n",
       "2  ['vegetable oil', 'onion', 'tomato', 'green ch...   \n",
       "3  ['lamb shoulder', 'garam masala', 'salt', 'but...   \n",
       "4  ['potato', 'pea', 'vegetable oil', 'cumin seed...   \n",
       "\n",
       "                                        Instructions  \\\n",
       "0  rub the crabs with  teaspoon of the turmeric a...   \n",
       "1  bring the tomatoes  cumin  turmeric  salt  and...   \n",
       "2  heat  tablespoon oil in a skillet over medium ...   \n",
       "3  season the lamb with garam_masala and salt   h...   \n",
       "4  bring a medium saucepan of lightly salted wate...   \n",
       "\n",
       "                                     Ingredient_List  \\\n",
       "0  ['Indian_Subcontinent', 'dungeness_crab', 'tur...   \n",
       "1  ['Indian_Subcontinent', 'tomato', 'cumin', 'tu...   \n",
       "2  ['Indian_Subcontinent', 'vegetable_oil', 'onio...   \n",
       "3  ['Indian_Subcontinent', 'lamb_shoulder', 'gara...   \n",
       "4  ['Indian_Subcontinent', 'potato', 'pea', 'vege...   \n",
       "\n",
       "                                       Ing List Sent  \\\n",
       "0  This recipe from Indian_Subcontinent cuisine c...   \n",
       "1  This recipe from Indian_Subcontinent cuisine c...   \n",
       "2  This recipe from Indian_Subcontinent cuisine c...   \n",
       "3  This recipe from Indian_Subcontinent cuisine c...   \n",
       "4  This recipe from Indian_Subcontinent cuisine c...   \n",
       "\n",
       "                                            Ing Inst  \\\n",
       "0  This recipe from Indian_Subcontinent cuisine c...   \n",
       "1  This recipe from Indian_Subcontinent cuisine c...   \n",
       "2  This recipe from Indian_Subcontinent cuisine c...   \n",
       "3  This recipe from Indian_Subcontinent cuisine c...   \n",
       "4  This recipe from Indian_Subcontinent cuisine c...   \n",
       "\n",
       "                                       tidy_ing_inst  \n",
       "0  This recipe from Indian_Subcontinent cuisine c...  \n",
       "1  This recipe from Indian_Subcontinent cuisine c...  \n",
       "2  This recipe from Indian_Subcontinent cuisine c...  \n",
       "3  This recipe from Indian_Subcontinent cuisine c...  \n",
       "4  This recipe from Indian_Subcontinent cuisine c...  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of df.\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "ca1c9ec7-9216-46e8-b7b3-760ece273037",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    Unnamed: 0.5  Unnamed: 0.4  Unnamed: 0.3  Unnamed: 0.2  Unnamed: 0.1  \\\n",
      "0              0             0             0             0             0   \n",
      "1              1             1             1             1             1   \n",
      "2              2             2             2             2             2   \n",
      "3              3             3             3             3             3   \n",
      "4              4             4             4             4             4   \n",
      "5            731           731           731           731           731   \n",
      "6            732           732           732           732           732   \n",
      "7            733           733           733           733           733   \n",
      "8            734           734           734           734           734   \n",
      "9            735           735           735           735           735   \n",
      "10          3005          3005          3005          3005          3005   \n",
      "11          3006          3006          3006          3006          3006   \n",
      "12          3007          3007          3007          3007          3007   \n",
      "13          3008          3008          3008          3008          3008   \n",
      "14          3009          3009          3009          3009          3009   \n",
      "15          3230          3230          3230          3230          3230   \n",
      "16          3231          3231          3231          3231          3231   \n",
      "17          3232          3232          3232          3232          3232   \n",
      "18          3233          3233          3233          3233          3233   \n",
      "19          3234          3234          3234          3234          3234   \n",
      "20          5749          5749          5749          5749          5749   \n",
      "21          5750          5750          5750          5750          5750   \n",
      "22          5751          5751          5751          5751          5751   \n",
      "23          5752          5752          5752          5752          5752   \n",
      "24          5753          5753          5753          5753          5753   \n",
      "\n",
      "    Unnamed: 0  Recipe_id               Region   Sub_region       Continent  \\\n",
      "0            0       3697  Indian Subcontinent  Bangladeshi           Asian   \n",
      "1            1       3698  Indian Subcontinent  Bangladeshi           Asian   \n",
      "2            2       3699  Indian Subcontinent  Bangladeshi           Asian   \n",
      "3            3       3700  Indian Subcontinent  Bangladeshi           Asian   \n",
      "4            4       3701  Indian Subcontinent  Bangladeshi           Asian   \n",
      "5          731       5704              Mexican      Mexican  Latin American   \n",
      "6          732       5705              Mexican      Mexican  Latin American   \n",
      "7          733       5706              Mexican      Mexican  Latin American   \n",
      "8          734       5707              Mexican      Mexican  Latin American   \n",
      "9          735       5708              Mexican      Mexican  Latin American   \n",
      "10        3005       8291       South American    Argentine  Latin American   \n",
      "11        3006       8292       South American    Argentine  Latin American   \n",
      "12        3007       8293       South American    Argentine  Latin American   \n",
      "13        3008       8294       South American    Argentine  Latin American   \n",
      "14        3009       8295       South American    Argentine  Latin American   \n",
      "15        3230      10218              Italian      Italian        European   \n",
      "16        3231      10219              Italian      Italian        European   \n",
      "17        3232      10220              Italian      Italian        European   \n",
      "18        3233      10221              Italian      Italian        European   \n",
      "19        3234      10222              Italian      Italian        European   \n",
      "20        5749      18754             Canadian     Canadian  North American   \n",
      "21        5750      18755             Canadian     Canadian  North American   \n",
      "22        5751      18756             Canadian     Canadian  North American   \n",
      "23        5752      18757             Canadian     Canadian  North American   \n",
      "24        5753      18758             Canadian     Canadian  North American   \n",
      "\n",
      "                                          ingredients  \\\n",
      "0   ['dungeness crab', 'turmeric', 'salt', 'mustar...   \n",
      "1   ['tomato', 'cumin', 'turmeric', 'salt', 'water...   \n",
      "2   ['vegetable oil', 'onion', 'tomato', 'green ch...   \n",
      "3   ['lamb shoulder', 'garam masala', 'salt', 'but...   \n",
      "4   ['potato', 'pea', 'vegetable oil', 'cumin seed...   \n",
      "5   ['cumin', 'chili powder', 'buttery', 'tilapia ...   \n",
      "6   ['mango salsa', 'mango', 'avocado', 'tomato', ...   \n",
      "7   ['shrimp', 'cream cheese', 'garlic', 'cilantro...   \n",
      "8   ['canola oil', 'acorn squash', 'red onion', 's...   \n",
      "9   ['butter', 'olive oil', 'parmesan cheese', 'it...   \n",
      "10      ['egg', 'peel lime', 'milk', 'rum', 'nutmeg']   \n",
      "11  ['milk', 'vanilla extract', 'purpose flour', '...   \n",
      "12        ['plantain', 'butter', 'mozzarella cheese']   \n",
      "13  ['oil', 'plantain', 'egg', 'white sugar', 'van...   \n",
      "14  ['olive oil', 'onion', 'celery', 'carrot', 'qu...   \n",
      "15  ['extra beef', 'egg', 'breadcrumb', 'parmesan ...   \n",
      "16  ['olive oil', 'garlic', 'onion', 'celery', 'ca...   \n",
      "17  ['olive oil', 'onion', 'garlic', 'red bell pep...   \n",
      "18  ['garlic', 'onion', 'olive oil', 'tomato', 'wa...   \n",
      "19  ['butter', 'leek lengthwise', 'rosemary', 'gar...   \n",
      "20  ['pearl barley', 'water', 'canola oil', 'onion...   \n",
      "21  ['molasses', 'brown sugar', 'ketchup', 'lemon ...   \n",
      "22  ['water', 'maple syrup', 'cranberry', 'brown s...   \n",
      "23  ['purpose flour', 'baking powder', 'ginger', '...   \n",
      "24  ['purpose flour', 'chili powder', 'onion', 'sa...   \n",
      "\n",
      "                                         Instructions  \\\n",
      "0   rub the crabs with  teaspoon of the turmeric a...   \n",
      "1   bring the tomatoes  cumin  turmeric  salt  and...   \n",
      "2   heat  tablespoon oil in a skillet over medium ...   \n",
      "3   season the lamb with garam_masala and salt   h...   \n",
      "4   bring a medium saucepan of lightly salted wate...   \n",
      "5   mix cumin and chili_powder together and sprink...   \n",
      "6   stir mango  avocado  tomato  red_onion  red_pe...   \n",
      "7   preheat oven to  degrees f   stir shrimp  crea...   \n",
      "8   heat canola_oil in a deep fryer or heavy pot o...   \n",
      "9   preheat an outdoor grill for medium high heat ...   \n",
      "10  whisk eggs and lime_peel together in a large b...   \n",
      "11  to make filling  pour the condensed milk into ...   \n",
      "12  preheat oven to  degrees f   line a baking she...   \n",
      "13  heat about  inch of oil in a large  heavy skil...   \n",
      "14  pour oil into a medium saucepan  and place ove...   \n",
      "15  mix together the meat  egg  bread_crumbs  chee...   \n",
      "16  in a large stock pot  over medium low heat  he...   \n",
      "17  in a large pot over medium high heat  combine ...   \n",
      "18  in a large pot over medium heat  saute the gar...   \n",
      "19  melt butter in a large saucepan over medium he...   \n",
      "20  combine pearl_barley and boiling water in a bo...   \n",
      "21  combine the molasses  brown_sugar  ketchup  le...   \n",
      "22  combine water and maple_syrup in a saucepan  b...   \n",
      "23  preheat oven to  degrees f   lightly grease  l...   \n",
      "24  mix flour  chili_powder  dried minced onion  s...   \n",
      "\n",
      "                                      Ingredient_List  \\\n",
      "0   ['Indian_Subcontinent', 'dungeness_crab', 'tur...   \n",
      "1   ['Indian_Subcontinent', 'tomato', 'cumin', 'tu...   \n",
      "2   ['Indian_Subcontinent', 'vegetable_oil', 'onio...   \n",
      "3   ['Indian_Subcontinent', 'lamb_shoulder', 'gara...   \n",
      "4   ['Indian_Subcontinent', 'potato', 'pea', 'vege...   \n",
      "5   ['Mexican', 'cumin', 'chili_powder', 'buttery'...   \n",
      "6   ['Mexican', 'mango_salsa', 'mango', 'avocado',...   \n",
      "7   ['Mexican', 'shrimp', 'cream_cheese', 'garlic'...   \n",
      "8   ['Mexican', 'canola_oil', 'acorn_squash', 'red...   \n",
      "9   ['Mexican', 'butter', 'olive_oil', 'parmesan_c...   \n",
      "10  ['South_American', 'egg', 'peel_lime', 'milk',...   \n",
      "11  ['South_American', 'milk', 'vanilla_extract', ...   \n",
      "12  ['South_American', 'plantain', 'butter', 'mozz...   \n",
      "13  ['South_American', 'oil', 'plantain', 'egg', '...   \n",
      "14  ['South_American', 'olive_oil', 'onion', 'cele...   \n",
      "15  ['Italian', 'extra_beef', 'egg', 'breadcrumb',...   \n",
      "16  ['Italian', 'olive_oil', 'garlic', 'onion', 'c...   \n",
      "17  ['Italian', 'olive_oil', 'onion', 'garlic', 'r...   \n",
      "18  ['Italian', 'garlic', 'onion', 'olive_oil', 't...   \n",
      "19  ['Italian', 'butter', 'leek_lengthwise', 'rose...   \n",
      "20  ['Canadian', 'pearl_barley', 'water', 'canola_...   \n",
      "21  ['Canadian', 'molasses', 'brown_sugar', 'ketch...   \n",
      "22  ['Canadian', 'water', 'maple_syrup', 'cranberr...   \n",
      "23  ['Canadian', 'purpose_flour', 'baking_powder',...   \n",
      "24  ['Canadian', 'purpose_flour', 'chili_powder', ...   \n",
      "\n",
      "                                        Ing List Sent  \\\n",
      "0   This recipe from Indian_Subcontinent cuisine c...   \n",
      "1   This recipe from Indian_Subcontinent cuisine c...   \n",
      "2   This recipe from Indian_Subcontinent cuisine c...   \n",
      "3   This recipe from Indian_Subcontinent cuisine c...   \n",
      "4   This recipe from Indian_Subcontinent cuisine c...   \n",
      "5   This recipe from Mexican cuisine contains cumi...   \n",
      "6   This recipe from Mexican cuisine contains mang...   \n",
      "7   This recipe from Mexican cuisine contains shri...   \n",
      "8   This recipe from Mexican cuisine contains cano...   \n",
      "9   This recipe from Mexican cuisine contains butt...   \n",
      "10  This recipe from South_American cuisine contai...   \n",
      "11  This recipe from South_American cuisine contai...   \n",
      "12  This recipe from South_American cuisine contai...   \n",
      "13  This recipe from South_American cuisine contai...   \n",
      "14  This recipe from South_American cuisine contai...   \n",
      "15  This recipe from Italian cuisine contains extr...   \n",
      "16  This recipe from Italian cuisine contains oliv...   \n",
      "17  This recipe from Italian cuisine contains oliv...   \n",
      "18  This recipe from Italian cuisine contains garl...   \n",
      "19  This recipe from Italian cuisine contains butt...   \n",
      "20  This recipe from Canadian cuisine contains pea...   \n",
      "21  This recipe from Canadian cuisine contains mol...   \n",
      "22  This recipe from Canadian cuisine contains wat...   \n",
      "23  This recipe from Canadian cuisine contains pur...   \n",
      "24  This recipe from Canadian cuisine contains pur...   \n",
      "\n",
      "                                             Ing Inst  \\\n",
      "0   This recipe from Indian_Subcontinent cuisine c...   \n",
      "1   This recipe from Indian_Subcontinent cuisine c...   \n",
      "2   This recipe from Indian_Subcontinent cuisine c...   \n",
      "3   This recipe from Indian_Subcontinent cuisine c...   \n",
      "4   This recipe from Indian_Subcontinent cuisine c...   \n",
      "5   This recipe from Mexican cuisine contains cumi...   \n",
      "6   This recipe from Mexican cuisine contains mang...   \n",
      "7   This recipe from Mexican cuisine contains shri...   \n",
      "8   This recipe from Mexican cuisine contains cano...   \n",
      "9   This recipe from Mexican cuisine contains butt...   \n",
      "10  This recipe from South_American cuisine contai...   \n",
      "11  This recipe from South_American cuisine contai...   \n",
      "12  This recipe from South_American cuisine contai...   \n",
      "13  This recipe from South_American cuisine contai...   \n",
      "14  This recipe from South_American cuisine contai...   \n",
      "15  This recipe from Italian cuisine contains extr...   \n",
      "16  This recipe from Italian cuisine contains oliv...   \n",
      "17  This recipe from Italian cuisine contains oliv...   \n",
      "18  This recipe from Italian cuisine contains garl...   \n",
      "19  This recipe from Italian cuisine contains butt...   \n",
      "20  This recipe from Canadian cuisine contains pea...   \n",
      "21  This recipe from Canadian cuisine contains mol...   \n",
      "22  This recipe from Canadian cuisine contains wat...   \n",
      "23  This recipe from Canadian cuisine contains pur...   \n",
      "24  This recipe from Canadian cuisine contains pur...   \n",
      "\n",
      "                                        tidy_ing_inst  \n",
      "0   This recipe from Indian_Subcontinent cuisine c...  \n",
      "1   This recipe from Indian_Subcontinent cuisine c...  \n",
      "2   This recipe from Indian_Subcontinent cuisine c...  \n",
      "3   This recipe from Indian_Subcontinent cuisine c...  \n",
      "4   This recipe from Indian_Subcontinent cuisine c...  \n",
      "5   This recipe from Mexican cuisine contains cumi...  \n",
      "6   This recipe from Mexican cuisine contains mang...  \n",
      "7   This recipe from Mexican cuisine contains shri...  \n",
      "8   This recipe from Mexican cuisine contains cano...  \n",
      "9   This recipe from Mexican cuisine contains butt...  \n",
      "10  This recipe from South_American cuisine contai...  \n",
      "11  This recipe from South_American cuisine contai...  \n",
      "12  This recipe from South_American cuisine contai...  \n",
      "13  This recipe from South_American cuisine contai...  \n",
      "14  This recipe from South_American cuisine contai...  \n",
      "15  This recipe from Italian cuisine contains extr...  \n",
      "16  This recipe from Italian cuisine contains oliv...  \n",
      "17  This recipe from Italian cuisine contains oliv...  \n",
      "18  This recipe from Italian cuisine contains garl...  \n",
      "19  This recipe from Italian cuisine contains butt...  \n",
      "20  This recipe from Canadian cuisine contains pea...  \n",
      "21  This recipe from Canadian cuisine contains mol...  \n",
      "22  This recipe from Canadian cuisine contains wat...  \n",
      "23  This recipe from Canadian cuisine contains pur...  \n",
      "24  This recipe from Canadian cuisine contains pur...  \n"
     ]
    }
   ],
   "source": [
    "# Build a small sample of the data: the first few recipes of every region.\n",
    "\n",
    "# Number of leading rows to keep from each region.\n",
    "ROWS_PER_REGION = 5\n",
    "\n",
    "# One pass over df instead of one boolean-mask scan per region.\n",
    "# sort=False yields the regions in order of first appearance, which matches\n",
    "# iterating over df[\"Region\"].unique(); rows whose Region is missing are left\n",
    "# out by groupby's default handling, just as the original emptiness check did.\n",
    "region_dfs = [\n",
    "    group.head(ROWS_PER_REGION)\n",
    "    for _, group in df.groupby(\"Region\", sort=False)\n",
    "]\n",
    "\n",
    "if region_dfs:\n",
    "    # Stack the per-region samples into one frame with a fresh 0..n-1 index.\n",
    "    result_df = pd.concat(region_dfs, ignore_index=True)\n",
    "    print(result_df)\n",
    "else:\n",
    "    # pd.concat raises ValueError on an empty list, so fall back to an empty\n",
    "    # frame with df's columns. This keeps result_df defined (and never stale\n",
    "    # from an earlier run) even when there is nothing to sample.\n",
    "    result_df = df.head(0)\n",
    "    print(\"No data available for the selected regions.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3bc3bd51-fb31-4bba-8260-cae716125fcc",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0.5</th>\n",
       "      <th>Unnamed: 0.4</th>\n",
       "      <th>Unnamed: 0.3</th>\n",
       "      <th>Unnamed: 0.2</th>\n",
       "      <th>Unnamed: 0.1</th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Recipe_id</th>\n",
       "      <th>Region</th>\n",
       "      <th>Sub_region</th>\n",
       "      <th>Continent</th>\n",
       "      <th>ingredients</th>\n",
       "      <th>Instructions</th>\n",
       "      <th>Ingredient_List</th>\n",
       "      <th>Ing List Sent</th>\n",
       "      <th>Ing Inst</th>\n",
       "      <th>tidy_ing_inst</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3697</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['dungeness crab', 'turmeric', 'salt', 'mustar...</td>\n",
       "      <td>rub the crabs with  teaspoon of the turmeric a...</td>\n",
       "      <td>['Indian_Subcontinent', 'dungeness_crab', 'tur...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3698</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['tomato', 'cumin', 'turmeric', 'salt', 'water...</td>\n",
       "      <td>bring the tomatoes  cumin  turmeric  salt  and...</td>\n",
       "      <td>['Indian_Subcontinent', 'tomato', 'cumin', 'tu...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>3699</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['vegetable oil', 'onion', 'tomato', 'green ch...</td>\n",
       "      <td>heat  tablespoon oil in a skillet over medium ...</td>\n",
       "      <td>['Indian_Subcontinent', 'vegetable_oil', 'onio...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3700</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['lamb shoulder', 'garam masala', 'salt', 'but...</td>\n",
       "      <td>season the lamb with garam_masala and salt   h...</td>\n",
       "      <td>['Indian_Subcontinent', 'lamb_shoulder', 'gara...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>3701</td>\n",
       "      <td>Indian Subcontinent</td>\n",
       "      <td>Bangladeshi</td>\n",
       "      <td>Asian</td>\n",
       "      <td>['potato', 'pea', 'vegetable oil', 'cumin seed...</td>\n",
       "      <td>bring a medium saucepan of lightly salted wate...</td>\n",
       "      <td>['Indian_Subcontinent', 'potato', 'pea', 'vege...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "      <td>This recipe from Indian_Subcontinent cuisine c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>731</td>\n",
       "      <td>731</td>\n",
       "      <td>731</td>\n",
       "      <td>731</td>\n",
       "      <td>731</td>\n",
       "      <td>731</td>\n",
       "      <td>5704</td>\n",
       "      <td>Mexican</td>\n",
       "      <td>Mexican</td>\n",
       "      <td>Latin American</td>\n",
       "      <td>['cumin', 'chili powder', 'buttery', 'tilapia ...</td>\n",
       "      <td>mix cumin and chili_powder together and sprink...</td>\n",
       "      <td>['Mexican', 'cumin', 'chili_powder', 'buttery'...</td>\n",
       "      <td>This recipe from Mexican cuisine contains cumi...</td>\n",
       "      <td>This recipe from Mexican cuisine contains cumi...</td>\n",
       "      <td>This recipe from Mexican cuisine contains cumi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>732</td>\n",
       "      <td>732</td>\n",
       "      <td>732</td>\n",
       "      <td>732</td>\n",
       "      <td>732</td>\n",
       "      <td>732</td>\n",
       "      <td>5705</td>\n",
       "      <td>Mexican</td>\n",
       "      <td>Mexican</td>\n",
       "      <td>Latin American</td>\n",
       "      <td>['mango salsa', 'mango', 'avocado', 'tomato', ...</td>\n",
       "      <td>stir mango  avocado  tomato  red_onion  red_pe...</td>\n",
       "      <td>['Mexican', 'mango_salsa', 'mango', 'avocado',...</td>\n",
       "      <td>This recipe from Mexican cuisine contains mang...</td>\n",
       "      <td>This recipe from Mexican cuisine contains mang...</td>\n",
       "      <td>This recipe from Mexican cuisine contains mang...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>733</td>\n",
       "      <td>733</td>\n",
       "      <td>733</td>\n",
       "      <td>733</td>\n",
       "      <td>733</td>\n",
       "      <td>733</td>\n",
       "      <td>5706</td>\n",
       "      <td>Mexican</td>\n",
       "      <td>Mexican</td>\n",
       "      <td>Latin American</td>\n",
       "      <td>['shrimp', 'cream cheese', 'garlic', 'cilantro...</td>\n",
       "      <td>preheat oven to  degrees f   stir shrimp  crea...</td>\n",
       "      <td>['Mexican', 'shrimp', 'cream_cheese', 'garlic'...</td>\n",
       "      <td>This recipe from Mexican cuisine contains shri...</td>\n",
       "      <td>This recipe from Mexican cuisine contains shri...</td>\n",
       "      <td>This recipe from Mexican cuisine contains shri...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>734</td>\n",
       "      <td>734</td>\n",
       "      <td>734</td>\n",
       "      <td>734</td>\n",
       "      <td>734</td>\n",
       "      <td>734</td>\n",
       "      <td>5707</td>\n",
       "      <td>Mexican</td>\n",
       "      <td>Mexican</td>\n",
       "      <td>Latin American</td>\n",
       "      <td>['canola oil', 'acorn squash', 'red onion', 's...</td>\n",
       "      <td>heat canola_oil in a deep fryer or heavy pot o...</td>\n",
       "      <td>['Mexican', 'canola_oil', 'acorn_squash', 'red...</td>\n",
       "      <td>This recipe from Mexican cuisine contains cano...</td>\n",
       "      <td>This recipe from Mexican cuisine contains cano...</td>\n",
       "      <td>This recipe from Mexican cuisine contains cano...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>735</td>\n",
       "      <td>735</td>\n",
       "      <td>735</td>\n",
       "      <td>735</td>\n",
       "      <td>735</td>\n",
       "      <td>735</td>\n",
       "      <td>5708</td>\n",
       "      <td>Mexican</td>\n",
       "      <td>Mexican</td>\n",
       "      <td>Latin American</td>\n",
       "      <td>['butter', 'olive oil', 'parmesan cheese', 'it...</td>\n",
       "      <td>preheat an outdoor grill for medium high heat ...</td>\n",
       "      <td>['Mexican', 'butter', 'olive_oil', 'parmesan_c...</td>\n",
       "      <td>This recipe from Mexican cuisine contains butt...</td>\n",
       "      <td>This recipe from Mexican cuisine contains butt...</td>\n",
       "      <td>This recipe from Mexican cuisine contains butt...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0.5  Unnamed: 0.4  Unnamed: 0.3  Unnamed: 0.2  Unnamed: 0.1  \\\n",
       "0             0             0             0             0             0   \n",
       "1             1             1             1             1             1   \n",
       "2             2             2             2             2             2   \n",
       "3             3             3             3             3             3   \n",
       "4             4             4             4             4             4   \n",
       "5           731           731           731           731           731   \n",
       "6           732           732           732           732           732   \n",
       "7           733           733           733           733           733   \n",
       "8           734           734           734           734           734   \n",
       "9           735           735           735           735           735   \n",
       "\n",
       "   Unnamed: 0  Recipe_id               Region   Sub_region       Continent  \\\n",
       "0           0       3697  Indian Subcontinent  Bangladeshi           Asian   \n",
       "1           1       3698  Indian Subcontinent  Bangladeshi           Asian   \n",
       "2           2       3699  Indian Subcontinent  Bangladeshi           Asian   \n",
       "3           3       3700  Indian Subcontinent  Bangladeshi           Asian   \n",
       "4           4       3701  Indian Subcontinent  Bangladeshi           Asian   \n",
       "5         731       5704              Mexican      Mexican  Latin American   \n",
       "6         732       5705              Mexican      Mexican  Latin American   \n",
       "7         733       5706              Mexican      Mexican  Latin American   \n",
       "8         734       5707              Mexican      Mexican  Latin American   \n",
       "9         735       5708              Mexican      Mexican  Latin American   \n",
       "\n",
       "                                         ingredients  \\\n",
       "0  ['dungeness crab', 'turmeric', 'salt', 'mustar...   \n",
       "1  ['tomato', 'cumin', 'turmeric', 'salt', 'water...   \n",
       "2  ['vegetable oil', 'onion', 'tomato', 'green ch...   \n",
       "3  ['lamb shoulder', 'garam masala', 'salt', 'but...   \n",
       "4  ['potato', 'pea', 'vegetable oil', 'cumin seed...   \n",
       "5  ['cumin', 'chili powder', 'buttery', 'tilapia ...   \n",
       "6  ['mango salsa', 'mango', 'avocado', 'tomato', ...   \n",
       "7  ['shrimp', 'cream cheese', 'garlic', 'cilantro...   \n",
       "8  ['canola oil', 'acorn squash', 'red onion', 's...   \n",
       "9  ['butter', 'olive oil', 'parmesan cheese', 'it...   \n",
       "\n",
       "                                        Instructions  \\\n",
       "0  rub the crabs with  teaspoon of the turmeric a...   \n",
       "1  bring the tomatoes  cumin  turmeric  salt  and...   \n",
       "2  heat  tablespoon oil in a skillet over medium ...   \n",
       "3  season the lamb with garam_masala and salt   h...   \n",
       "4  bring a medium saucepan of lightly salted wate...   \n",
       "5  mix cumin and chili_powder together and sprink...   \n",
       "6  stir mango  avocado  tomato  red_onion  red_pe...   \n",
       "7  preheat oven to  degrees f   stir shrimp  crea...   \n",
       "8  heat canola_oil in a deep fryer or heavy pot o...   \n",
       "9  preheat an outdoor grill for medium high heat ...   \n",
       "\n",
       "                                     Ingredient_List  \\\n",
       "0  ['Indian_Subcontinent', 'dungeness_crab', 'tur...   \n",
       "1  ['Indian_Subcontinent', 'tomato', 'cumin', 'tu...   \n",
       "2  ['Indian_Subcontinent', 'vegetable_oil', 'onio...   \n",
       "3  ['Indian_Subcontinent', 'lamb_shoulder', 'gara...   \n",
       "4  ['Indian_Subcontinent', 'potato', 'pea', 'vege...   \n",
       "5  ['Mexican', 'cumin', 'chili_powder', 'buttery'...   \n",
       "6  ['Mexican', 'mango_salsa', 'mango', 'avocado',...   \n",
       "7  ['Mexican', 'shrimp', 'cream_cheese', 'garlic'...   \n",
       "8  ['Mexican', 'canola_oil', 'acorn_squash', 'red...   \n",
       "9  ['Mexican', 'butter', 'olive_oil', 'parmesan_c...   \n",
       "\n",
       "                                       Ing List Sent  \\\n",
       "0  This recipe from Indian_Subcontinent cuisine c...   \n",
       "1  This recipe from Indian_Subcontinent cuisine c...   \n",
       "2  This recipe from Indian_Subcontinent cuisine c...   \n",
       "3  This recipe from Indian_Subcontinent cuisine c...   \n",
       "4  This recipe from Indian_Subcontinent cuisine c...   \n",
       "5  This recipe from Mexican cuisine contains cumi...   \n",
       "6  This recipe from Mexican cuisine contains mang...   \n",
       "7  This recipe from Mexican cuisine contains shri...   \n",
       "8  This recipe from Mexican cuisine contains cano...   \n",
       "9  This recipe from Mexican cuisine contains butt...   \n",
       "\n",
       "                                            Ing Inst  \\\n",
       "0  This recipe from Indian_Subcontinent cuisine c...   \n",
       "1  This recipe from Indian_Subcontinent cuisine c...   \n",
       "2  This recipe from Indian_Subcontinent cuisine c...   \n",
       "3  This recipe from Indian_Subcontinent cuisine c...   \n",
       "4  This recipe from Indian_Subcontinent cuisine c...   \n",
       "5  This recipe from Mexican cuisine contains cumi...   \n",
       "6  This recipe from Mexican cuisine contains mang...   \n",
       "7  This recipe from Mexican cuisine contains shri...   \n",
       "8  This recipe from Mexican cuisine contains cano...   \n",
       "9  This recipe from Mexican cuisine contains butt...   \n",
       "\n",
       "                                       tidy_ing_inst  \n",
       "0  This recipe from Indian_Subcontinent cuisine c...  \n",
       "1  This recipe from Indian_Subcontinent cuisine c...  \n",
       "2  This recipe from Indian_Subcontinent cuisine c...  \n",
       "3  This recipe from Indian_Subcontinent cuisine c...  \n",
       "4  This recipe from Indian_Subcontinent cuisine c...  \n",
       "5  This recipe from Mexican cuisine contains cumi...  \n",
       "6  This recipe from Mexican cuisine contains mang...  \n",
       "7  This recipe from Mexican cuisine contains shri...  \n",
       "8  This recipe from Mexican cuisine contains cano...  \n",
       "9  This recipe from Mexican cuisine contains butt...  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the first ten rows of result_df.\n",
    "result_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "fb76a3a3-3193-4a04-a9b3-d90d0a7c94d0",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Region labels may be spelled with spaces (\"South American\") or underscores\n",
    "# (\"South_American\"); normalise to underscores before comparing so that neither\n",
    "# spelling silently produces an empty sub-frame.\n",
    "region_key = df[\"Region\"].str.replace(\" \", \"_\", regex=False)\n",
    "\n",
    "# One sub-frame per cuisine.\n",
    "df_it = df[region_key == \"Italian\"]\n",
    "df_mex = df[region_key == \"Mexican\"]\n",
    "df_can = df[region_key == \"Canadian\"]\n",
    "df_sa = df[region_key == \"South_American\"]\n",
    "df_in = df[region_key == \"Indian_Subcontinent\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "00aeb2f8-68d4-4571-a804-6becdb073f16",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Register tqdm's pandas integration; progress bars are labelled 'progress-bar'.\n",
    "tqdm.pandas(desc='progress-bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "6da22e4c-e3ba-4224-a5fb-78660b4661f6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def add_label(txt):\n",
    "    \"\"\"Wrap each document in a TaggedDocument tagged ``Recipe_<index label>``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    txt : pandas.Series-like\n",
    "        Documents keyed by ``txt.index``. Each item may be a list of tokens or a\n",
    "        single whitespace-separated string.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of TaggedDocument\n",
    "        One entry per document, in the iteration order of ``txt``.\n",
    "    \"\"\"\n",
    "    output = []\n",
    "    for i, s in zip(txt.index, txt):\n",
    "        # Doc2Vec expects `words` to be a list of tokens. A bare string would be\n",
    "        # iterated character by character, so split it on whitespace first.\n",
    "        words = s.split() if isinstance(s, str) else s\n",
    "        output.append(TaggedDocument(words, [\"Recipe_\" + str(i)]))\n",
    "    return output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "cb06b4bb-b075-4946-b0d9-be76573f3100",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Tag every document in tokenized_txt via add_label.\n",
    "labeled_txt= add_label(tokenized_txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "31c6ce17-d862-467a-a941-5c1f0962d96a",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[TaggedDocument(words='This recipe from Indian_Subcontinent cuisine contains dungeness_crab turmeric salt mustard_seed water mustard_oil red_onion potato_lengthwise clove cinnamon green_cardamom_pod black_peppercorn tomato thai_green_chile garlic_paste ginger_paste cayenne_pepper white_sugar garnish wedge_lemon cilantro ingredients crabs with teaspoon turmeric teaspoon salt them marinate hour combine mustard_seed hot_water small bowl stand minutes mortar pestle grind seeds into coarse paste heat kadhai over medium heat crabs stir until they change color about minutes remove crabs from aside sliced onions cook stir over medium heat until onions translucent about minutes raise heat high potatoes cook stirring constantly about minutes cloves cinnamon stick cardamom_pods peppercorns stir thirty seconds stir tomatoes ginger_paste garlic_paste halve three chiles them cook stir additional minute over high heat reduce heat medium remaining teaspoon turmeric cayenne_pepper mustard_paste stir combine crabs pour just enough water cover vegetables bring water boil stir sugar salt taste cover reduce heat simmer until potatoes tender water reduced half about minutes remove stir simmer until gravy thickened about minutes more squeeze lemon_wedge over finished dish garnish with chopped cilantro sliced green_chile serve with rice', tags=['Recipe_0']),\n",
       " TaggedDocument(words='This recipe from Indian_Subcontinent cuisine contains tomato cumin turmeric salt water vegetable_oil whitefish_fillet mustard_seed cumin_seed black_cumin_seed fennel_seed fenugreek_seed ingredients bring tomatoes cumin turmeric salt water boil four quart saucepan reduce heat medium maintain simmer heat skillet over medium high heat cook fish oiled skillet until golden brown minutes side transfer fish saucepan heat separate skillet over medium heat toast mustard_seeds cumin_seeds black_cumin_seeds fennel_seeds fenugreek_seeds until fragrant just seconds stir spices into saucepan simmer until flavors integrate about minutes more serve', tags=['Recipe_1']),\n",
       " TaggedDocument(words='This recipe from Indian_Subcontinent cuisine contains vegetable_oil onion tomato green_chile_pepper garlic_paste ginger_paste coriander cumin garam_masala turmeric cayenne_pepper salt water potato cilantro ingredients heat tablespoon skillet over medium heat stir onions cook stir until onion pieces soft translucent about minutes reduce heat medium continue cooking stirring occasionally until onions golden about minutes more remove from heat aside transfer cooked onions bowl food processor grind onions tomatoes green_chiles into smooth paste heat remaining tablespoons same skillet over medium heat onion mixture carefully spatter cook stir minutes stir garlic_paste ginger_paste coriander cumin garam_masala turmeric cayenne_pepper salt cook stir until begins separate from mixture about minutes pour water into skillet bring sauce boil cubed potatoes gently stir eggs reduce heat simmer until sauce thickened slightly about minutes remove from heat garnish with cilantro before serving', tags=['Recipe_2'])]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the first three tagged documents.\n",
    "labeled_txt[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c7fe7f2f-5776-4fde-a8de-674c2ece8866",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████| 51349/51349 [00:00<00:00, 3720003.39it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8min 26s, sys: 12.4 s, total: 8min 38s\n",
      "Wall time: 2min 57s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Build and train a Doc2Vec model on the tagged recipe documents.\n",
    "model_d2v = gensim.models.Doc2Vec(dm=1,  # dm=1 selects the 'distributed memory' model\n",
    "                                  dm_mean=1,  # use the mean of the context word vectors\n",
    "                                  vector_size=100,  # no. of desired features\n",
    "                                  window=10,  # width of the context window\n",
    "                                  negative=7,  # if > 0 then negative sampling will be used\n",
    "                                  min_count=1,  # keep every token (nothing has frequency below 1)\n",
    "                                  workers=4,  # no. of worker threads\n",
    "                                  alpha=0.1,  # initial learning rate\n",
    "                                  seed=23,  # RNG seed; gensim documents that fully deterministic runs also need workers=1\n",
    "                                  )\n",
    "# Materialise the corpus through tqdm so the vocabulary scan shows a progress bar.\n",
    "model_d2v.build_vocab(list(tqdm(labeled_txt)))\n",
    "\n",
    "# corpus_count is set by build_vocab, so total_examples always matches the corpus\n",
    "# that is actually being trained on (rather than the length of an unrelated frame).\n",
    "model_d2v.train(labeled_txt, total_examples=model_d2v.corpus_count, epochs=15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "99f46ca6-8eff-4d00-ba59-2a0d594307ad",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(51349, 100)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect one learned document vector per recipe into a (n_recipes, vector_size) table.\n",
    "# Vectors are looked up by the tag assigned in add_label (\"Recipe_<index label>\") so that\n",
    "# rows stay aligned with the recipe index even if it is not a contiguous 0..n-1 range.\n",
    "recipe_index = tokenized_txt.index\n",
    "docvec_arrays = np.zeros((len(recipe_index), model_d2v.vector_size))\n",
    "for row, recipe_id in enumerate(recipe_index):\n",
    "    # `dv` is the gensim 4 name for the deprecated `docvecs` attribute.\n",
    "    docvec_arrays[row, :] = model_d2v.dv[\"Recipe_\" + str(recipe_id)]\n",
    "\n",
    "# Carry the recipe index over so later label-based selection (.loc) hits the right rows.\n",
    "docvec_df = pd.DataFrame(docvec_arrays, index=recipe_index)\n",
    "docvec_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "d626651c-dfeb-4eae-8e5b-0474147a5a1c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>90</th>\n",
       "      <th>91</th>\n",
       "      <th>92</th>\n",
       "      <th>93</th>\n",
       "      <th>94</th>\n",
       "      <th>95</th>\n",
       "      <th>96</th>\n",
       "      <th>97</th>\n",
       "      <th>98</th>\n",
       "      <th>99</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.047435</td>\n",
       "      <td>-1.397575</td>\n",
       "      <td>-0.352318</td>\n",
       "      <td>-0.650628</td>\n",
       "      <td>-0.766790</td>\n",
       "      <td>0.255121</td>\n",
       "      <td>-0.399228</td>\n",
       "      <td>-0.152939</td>\n",
       "      <td>1.125835</td>\n",
       "      <td>-0.776052</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.495638</td>\n",
       "      <td>-0.207134</td>\n",
       "      <td>-0.441011</td>\n",
       "      <td>-0.103301</td>\n",
       "      <td>0.604050</td>\n",
       "      <td>0.379858</td>\n",
       "      <td>-0.316622</td>\n",
       "      <td>-0.045448</td>\n",
       "      <td>-0.740693</td>\n",
       "      <td>0.253146</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.942616</td>\n",
       "      <td>-1.073195</td>\n",
       "      <td>-0.267142</td>\n",
       "      <td>-0.407136</td>\n",
       "      <td>-0.286008</td>\n",
       "      <td>0.157557</td>\n",
       "      <td>-0.658977</td>\n",
       "      <td>0.201305</td>\n",
       "      <td>-0.111158</td>\n",
       "      <td>-0.103662</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.231018</td>\n",
       "      <td>-0.038742</td>\n",
       "      <td>-0.441324</td>\n",
       "      <td>-0.120024</td>\n",
       "      <td>0.719968</td>\n",
       "      <td>0.382789</td>\n",
       "      <td>0.428318</td>\n",
       "      <td>0.000282</td>\n",
       "      <td>-0.358434</td>\n",
       "      <td>0.788527</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.331891</td>\n",
       "      <td>-1.444541</td>\n",
       "      <td>0.076602</td>\n",
       "      <td>0.110054</td>\n",
       "      <td>-0.203973</td>\n",
       "      <td>0.399367</td>\n",
       "      <td>-0.298804</td>\n",
       "      <td>-0.395904</td>\n",
       "      <td>0.357500</td>\n",
       "      <td>-0.233924</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.666679</td>\n",
       "      <td>0.606347</td>\n",
       "      <td>-0.522236</td>\n",
       "      <td>-0.414603</td>\n",
       "      <td>0.429737</td>\n",
       "      <td>0.275531</td>\n",
       "      <td>-0.059719</td>\n",
       "      <td>-0.120812</td>\n",
       "      <td>-0.198372</td>\n",
       "      <td>0.367137</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.489481</td>\n",
       "      <td>-0.677273</td>\n",
       "      <td>0.377267</td>\n",
       "      <td>-0.137194</td>\n",
       "      <td>0.236240</td>\n",
       "      <td>-0.052438</td>\n",
       "      <td>-0.495107</td>\n",
       "      <td>-0.074659</td>\n",
       "      <td>0.117526</td>\n",
       "      <td>-0.922850</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.006133</td>\n",
       "      <td>-0.093471</td>\n",
       "      <td>-0.939735</td>\n",
       "      <td>0.080330</td>\n",
       "      <td>0.034031</td>\n",
       "      <td>-0.292428</td>\n",
       "      <td>0.221175</td>\n",
       "      <td>0.189336</td>\n",
       "      <td>0.014682</td>\n",
       "      <td>0.851447</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.165744</td>\n",
       "      <td>-0.231994</td>\n",
       "      <td>-0.091892</td>\n",
       "      <td>0.146815</td>\n",
       "      <td>-0.204355</td>\n",
       "      <td>-0.040721</td>\n",
       "      <td>-0.327735</td>\n",
       "      <td>0.351024</td>\n",
       "      <td>-0.562044</td>\n",
       "      <td>-0.350277</td>\n",
       "      <td>...</td>\n",
       "      <td>0.316733</td>\n",
       "      <td>0.024035</td>\n",
       "      <td>0.067238</td>\n",
       "      <td>-0.381888</td>\n",
       "      <td>0.543412</td>\n",
       "      <td>0.092810</td>\n",
       "      <td>0.226611</td>\n",
       "      <td>0.295057</td>\n",
       "      <td>0.201822</td>\n",
       "      <td>0.017615</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 100 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         0         1         2         3         4         5         6   \\\n",
       "0 -0.047435 -1.397575 -0.352318 -0.650628 -0.766790  0.255121 -0.399228   \n",
       "1 -0.942616 -1.073195 -0.267142 -0.407136 -0.286008  0.157557 -0.658977   \n",
       "2 -0.331891 -1.444541  0.076602  0.110054 -0.203973  0.399367 -0.298804   \n",
       "3 -0.489481 -0.677273  0.377267 -0.137194  0.236240 -0.052438 -0.495107   \n",
       "4  0.165744 -0.231994 -0.091892  0.146815 -0.204355 -0.040721 -0.327735   \n",
       "\n",
       "         7         8         9   ...        90        91        92        93  \\\n",
       "0 -0.152939  1.125835 -0.776052  ... -0.495638 -0.207134 -0.441011 -0.103301   \n",
       "1  0.201305 -0.111158 -0.103662  ... -0.231018 -0.038742 -0.441324 -0.120024   \n",
       "2 -0.395904  0.357500 -0.233924  ... -0.666679  0.606347 -0.522236 -0.414603   \n",
       "3 -0.074659  0.117526 -0.922850  ... -0.006133 -0.093471 -0.939735  0.080330   \n",
       "4  0.351024 -0.562044 -0.350277  ...  0.316733  0.024035  0.067238 -0.381888   \n",
       "\n",
       "         94        95        96        97        98        99  \n",
       "0  0.604050  0.379858 -0.316622 -0.045448 -0.740693  0.253146  \n",
       "1  0.719968  0.382789  0.428318  0.000282 -0.358434  0.788527  \n",
       "2  0.429737  0.275531 -0.059719 -0.120812 -0.198372  0.367137  \n",
       "3  0.034031 -0.292428  0.221175  0.189336  0.014682  0.851447  \n",
       "4  0.543412  0.092810  0.226611  0.295057  0.201822  0.017615  \n",
       "\n",
       "[5 rows x 100 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the document-vector table.\n",
    "docvec_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "ee2f5a08-8150-4474-9c47-f7069028accb",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Indian Subcontinent' 'Mexican' 'South American' 'Italian' 'Canadian']\n"
     ]
    }
   ],
   "source": [
    "# Distinct values of the Region column, in order of first appearance.\n",
    "categories = df[\"Region\"].unique()\n",
    "print(categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "f928b10d-d87c-4ab1-ba67-59b8a30d654f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Average the document vectors of all recipes in a cuisine to get one embedding per cuisine.\n",
    "cuisine_emb_dict = {}\n",
    "for cuisine in categories:\n",
    "    indices = df[df[\"Region\"] == cuisine].index\n",
    "    selected_rows = docvec_df.loc[indices]\n",
    "    # Take an explicit column-wise mean. np.mean(DataFrame) forwards axis=None, which\n",
    "    # pandas >= 2.0 reduces over both axes to a single scalar instead of one value per dimension.\n",
    "    cuisine_emb_dict[cuisine] = selected_rows.mean(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "93ad4c21-e2d4-4a70-a3ea-e192e99e2e76",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Length of the averaged embedding stored under the 'Italian' key.\n",
    "len(cuisine_emb_dict[\"Italian\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "bb8775cf-e6a4-40c2-ab31-e7b406a74ae6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the per-cuisine embedding dictionary to disk with pickle.\n",
    "with open('cuisine_emb_dict_Doc2Vec.pkl', 'wb') as pkl_file:\n",
    "    pickle.dump(cuisine_emb_dict, pkl_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99c839fc-76b5-41e7-845e-39a8de0b2d5a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
