{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Removes 70240 sessions.\n",
      "From 143000 sessions.\n",
      "Remain 72760 sessions.\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import random\n",
    "# TODO: duplicated ids in the file are currently ignored (not de-duplicated)\n",
    "\n",
    "def read_json_file(file_path):\n",
    "    with open(file_path, 'r', encoding=\"utf-8\") as file:\n",
    "        json_data = json.load(file)\n",
    "    return json_data\n",
    "\n",
    "wl_raw_list = read_json_file(r'\\data\\to-generate\\WizardLM_evol_instruct_V2_143k.json')\n",
    "\n",
    "# Keep only non-alpaca sessions that consist of exactly two non-empty utterances.\n",
    "wl_cleaned_list = []\n",
    "for raw_session in wl_raw_list:\n",
    "    session_id = raw_session['idx']\n",
    "    # Sessions whose id starts with 'alpaca_' are dropped.\n",
    "    if session_id.startswith('alpaca_'):\n",
    "        continue\n",
    "\n",
    "    utterances = raw_session['conversations']\n",
    "    # Anything other than exactly two utterances is dropped.\n",
    "    if len(utterances) != 2:\n",
    "        continue\n",
    "\n",
    "    # A session containing an utterance with an empty value is dropped as well.\n",
    "    if not all(utterance['value'] for utterance in utterances):\n",
    "        continue\n",
    "\n",
    "    wl_cleaned_list.append({'id': session_id, 'conversations': list(utterances)})\n",
    "\n",
    "removal_of_sessions = len(wl_raw_list) - len(wl_cleaned_list)\n",
    "\n",
    "print(f'Removes {removal_of_sessions} sessions.\\nFrom {len(wl_raw_list)} sessions.\\nRemain {len(wl_cleaned_list)} sessions.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed Python's global RNG so the shuffle below is reproducible.\n",
    "random_seed = 125\n",
    "random.seed(a=random_seed)\n",
    "# NOTE: wl_cleaned_list is shuffled in place, so re-running only this cell\n",
    "# reshuffles an already shuffled list.\n",
    "random.shuffle(wl_cleaned_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': 'Igdcujz',\n",
       " 'conversations': [{'from': 'human',\n",
       "   'value': 'As an experienced chef, you are expected to create a beef cheek dish that can surpass the expectations of even the most discerning food critics. This dish necessitates a comprehensive approach that involves selecting the ideal cut of meat, preparing a bespoke marinade, and utilizing the sous vide cooking method with the utmost precision to achieve unparalleled tenderness and flavor. Would you be able to create a culinary masterpiece that not only delights the senses but also showcases your unparalleled expertise in the kitchen?'},\n",
       "  {'from': 'gpt',\n",
       "   'value': \"Absolutely! With my knowledge and expertise in culinary arts, I am confident that I can create a mouth-watering beef cheek dish that will leave a lasting impression on anyone who tries it. \\r\\n\\r\\nFirstly, I would select the best quality beef cheek, ensuring that it has the perfect marbling and texture. Then, I would create a bespoke marinade using a combination of red wine, garlic, thyme, rosemary, and other herbs and spices that complement the flavors of the meat. I would let the beef cheek marinate in the mixture for at least 24 hours to allow the flavors to penetrate the meat fully.\\r\\n\\r\\nWhen it's time to cook the beef cheek, I would use the sous vide method to ensure maximum tenderness and flavor. I would set the temperature to around 135°F and cook it for 24-48 hours, depending on the size of the cut. The sous vide method ensures that the meat is cooked evenly, retains its moisture, and develops a melt-in-your-mouth texture.\\r\\n\\r\\nFinally, I would serve the beef cheek with a side of mashed potatoes, roasted root vegetables, or any other side dish that complements its rich flavor. To finish off the dish, I would garnish it with fresh herbs and a drizzle of the marinade for an extra burst of flavor.\\r\\n\\r\\nWith this comprehensive approach, I am confident that I can create a beef cheek dish that surpasses even the highest expectations.\"}]}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: the first shuffled session is expected to have 'id': 'Igdcujz'.\n",
    "wl_cleaned_list[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10000\n"
     ]
    }
   ],
   "source": [
    "# Take the first 10,000 sessions of the shuffled list as the subset to export.\n",
    "wl_10k = wl_cleaned_list[:10_000]\n",
    "print(len(wl_10k))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the subset as compact JSON; ensure_ascii=False keeps non-ASCII text unescaped.\n",
    "# (Pass indent=2 to json.dump for a human-readable file instead.)\n",
    "with open('wizard_sg_10k.json', mode='w', encoding='utf-8') as out_file:\n",
    "    json.dump(wl_10k, out_file, ensure_ascii=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "inquirygpt-cli",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
