{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Takes the Kaggle dataset 'Recipes from Tasty' (https://www.kaggle.com/datasets/zeeenb/recipes-from-tasty?select=ingredient_and_instructions.json) and turns its recipes into basic dialogues using a preset list of user prompt templates."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# User prompt templates for single-turn dialogues.\n",
    "# One template is picked at random per recipe and `{title}` is filled in with the recipe name.\n",
    "ONE_STEP_TEMPLATES = [\n",
    "    \"How do I cook {title}?\",\n",
    "    \"How do I make {title}?\",\n",
    "    \"How do you make {title}?\",\n",
    "    \"Help me make {title}.\",\n",
    "    \"Tell me how to make {title}.\",\n",
    "    \"How do I prepare {title}?\",\n",
    "    \"Could you tell me how to prepare {title}?\",\n",
    "    \"Have you got a recipe for {title}?\",\n",
    "    \"Do you have a recipe for {title}?\",\n",
    "    \"Could I have the recipe for {title}?\",\n",
    "    \"Do you know how to make {title}?\",\n",
    "    \"How do I go about making {title}?\",\n",
    "    \"Can you tell me how to make {title}?\",\n",
    "]\n",
    "\n",
    "# TWO_STEP_TEMPLATES_1 = [\"What ingredients do I need to make {title}?\",\"What ingredients do I need to cook {title}?\",\"What do I need to make {title}?\",\"What do I need to cook {title}?\"]\n",
    "\n",
    "# TWO_STEP_TEMPLATES_2 = [\"What are the steps?\",\"How do I prepare it?\",\"How do I cook it?\",\"How can I cook it?\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import kaggle\n",
    "import pandas as pd\n",
    "import json\n",
    "import random\n",
    "import unicodedata\n",
    "import re\n",
    "from fractions import Fraction\n",
    "from IPython.display import display\n",
    "from datasets import Dataset\n",
    "\n",
    "# Kaggle page of the source dataset; stored in the SOURCE column of every generated row\n",
    "data_source = \"https://www.kaggle.com/datasets/zeeenb/recipes-from-tasty\"\n",
    "# Local data directory (the download and load cells below use the same \"data\" path);\n",
    "# it is created up front so those cells can rely on it existing\n",
    "output_dir = \"data\"\n",
    "os.makedirs(output_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert fraction unicode characters to strings (e.g. ½ -> 1/2)\n",
    "def convert_fraction_unicode_chars_to_strings(string):\n",
    "    output = \"\"\n",
    "    i = 0\n",
    "    while i < len(string):\n",
    "        char = string[i]\n",
    "        try:\n",
    "            if unicodedata.name(char).startswith(\"VULGAR FRACTION\"):  # check if the character is a fraction\n",
    "                val = unicodedata.numeric(char)\n",
    "                # if the current character is a fraction, find the end of the fraction\n",
    "                j = i + 1\n",
    "                while j < len(string):\n",
    "                    next_char = string[j]\n",
    "                    if not unicodedata.name(next_char).startswith(\n",
    "                        \"VULGAR FRACTION\"\n",
    "                    ):  # break if next character is not a fraction\n",
    "                        break\n",
    "                    next_val = unicodedata.numeric(next_char)\n",
    "                    val = val * 10 + next_val\n",
    "                    j += 1\n",
    "                # convert the numeric value to a Fraction object and then to a string with a maximum of 2 digits\n",
    "                fraction = str(Fraction(val).limit_denominator(100))\n",
    "                output += fraction\n",
    "                i = j\n",
    "            else:\n",
    "                # if the current character is not a fraction, simply add it to the output\n",
    "                output += char\n",
    "                i += 1\n",
    "        except ValueError:\n",
    "            # if the character does not have a name, skip it\n",
    "            i += 1\n",
    "    return output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download and unzip the Kaggle dataset into the configured data directory\n",
    "kaggle.api.dataset_download_files(\"zeeenb/recipes-from-tasty\", output_dir, unzip=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _format_ingredients(sections):\n",
    "    \"\"\"Render ingredient sections as text: optional section heading, then one bullet per ingredient.\"\"\"\n",
    "    text = \"\"\n",
    "    for section in sections:\n",
    "        if section[\"name\"]:\n",
    "            text += f\"\\n{section['name']}\\n\"\n",
    "        for ingredient in section[\"ingredients\"]:\n",
    "            # Quantity, unit and comment are optional; each may be \"\" or None\n",
    "            quantity = ingredient[\"primary_unit\"][\"quantity\"]\n",
    "            quantity = convert_fraction_unicode_chars_to_strings(quantity) + \" \" if quantity else \"\"\n",
    "            unit = ingredient[\"primary_unit\"][\"display\"]\n",
    "            unit = unit + \" \" if unit else \"\"\n",
    "            comment = ingredient[\"extra_comment\"]\n",
    "            comment = \", \" + comment if comment else \"\"\n",
    "            text += f\"\\n• {quantity}{unit}{ingredient['name']}{comment}\"\n",
    "        text += \"\\n\"\n",
    "    return text\n",
    "\n",
    "\n",
    "def _format_instructions(steps):\n",
    "    \"\"\"Render instruction steps as a numbered list, dropping a trailing \"Enjoy!\" step.\"\"\"\n",
    "    # Slice into a new list instead of editing the loaded JSON, so re-running the cell is safe;\n",
    "    # the `steps and` guard avoids an IndexError for recipes without instructions.\n",
    "    if steps and steps[-1][\"display_text\"] == \"Enjoy!\":\n",
    "        steps = steps[:-1]\n",
    "    return \"\".join(\n",
    "        f\"\\n{number}. {convert_fraction_unicode_chars_to_strings(step['display_text'])}\"\n",
    "        for number, step in enumerate(steps, start=1)\n",
    "    )\n",
    "\n",
    "\n",
    "dishes = pd.read_csv(os.path.join(output_dir, \"dishes.csv\"), usecols=[\"language\", \"name\", \"slug\"])\n",
    "# Remove non-English recipes\n",
    "dishes = dishes[dishes[\"language\"] == \"eng\"]\n",
    "# Load the per-recipe ingredients and instructions, keyed by recipe slug.\n",
    "# The context manager closes the file; the explicit encoding avoids platform-dependent decoding.\n",
    "with open(os.path.join(output_dir, \"ingredient_and_instructions.json\"), encoding=\"utf-8\") as json_file:\n",
    "    ingredient_and_instructions = json.load(json_file)\n",
    "\n",
    "# Fixed seed so the same prompt template is chosen for each recipe on every run\n",
    "template_rng = random.Random(42)\n",
    "\n",
    "# Create dataframe with columns INSTRUCTION, RESPONSE, SOURCE\n",
    "# The INSTRUCTION is a random choice from ONE_STEP_TEMPLATES with the title of the recipe filled in\n",
    "# The RESPONSE is the ingredients and instructions for the recipe concatenated\n",
    "# The SOURCE is the URL of the Kaggle dataset\n",
    "records = []\n",
    "for _, row in dishes.iterrows():\n",
    "    # Strip a leading \"How to Make \" from the recipe name (ignoring case)\n",
    "    recipe_name = re.sub(\"How to Make \", \"\", row[\"name\"], flags=re.IGNORECASE)\n",
    "    recipe_data = ingredient_and_instructions[row[\"slug\"]]\n",
    "\n",
    "    ingredients = _format_ingredients(recipe_data[\"ingredient_sections\"])\n",
    "    instructions = _format_instructions(recipe_data[\"instructions\"])\n",
    "\n",
    "    # Construct the full response\n",
    "    response = f\"\"\"Here's a recipe for {recipe_name}:\n",
    "\n",
    "Ingredients:\n",
    "{ingredients}\n",
    "Instructions:\n",
    "{instructions}\n",
    "\n",
    "Enjoy your {recipe_name}!\"\"\"\n",
    "    records.append(\n",
    "        {\n",
    "            \"INSTRUCTION\": template_rng.choice(ONE_STEP_TEMPLATES).format(title=recipe_name),\n",
    "            \"RESPONSE\": response,\n",
    "            \"SOURCE\": data_source,\n",
    "        }\n",
    "    )\n",
    "recipes = pd.DataFrame(records)\n",
    "\n",
    "# Show the first 5 rows with full column width and newlines rendered in the RESPONSE column.\n",
    "# None (not -1) is the supported way to disable truncation of display.max_colwidth.\n",
    "with pd.option_context(\"display.max_colwidth\", None):\n",
    "    display(\n",
    "        recipes.head().style.set_properties(\n",
    "            **{\n",
    "                \"text-align\": \"left\",\n",
    "                \"white-space\": \"pre-wrap\",\n",
    "            }\n",
    "        )\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the recipes to a local parquet file and load it back as a `datasets.Dataset`\n",
    "recipes.to_parquet(\"dataset.parquet\", row_group_size=100, engine=\"pyarrow\")\n",
    "ds = Dataset.from_parquet(\"dataset.parquet\")\n",
    "# Push the dataset to the \"dctanner/oa_recipes\" repo on the HF Hub.\n",
    "# NOTE(review): this line is active, so running the cell uploads; comment it out to skip the push.\n",
    "ds.push_to_hub(\"dctanner/oa_recipes\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "281f1c8753b18c9d2968280632816a025c721e632f5f355c2f6dfab2614fba3c"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
