{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/XXXX-30/XXXX-29/XXXX-31/scratch/XXXX-1/frontier_conda_simpleton/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset, load_from_disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_text(row, text_field=\"text\"):\n",
    "    \"\"\"Passthrough for text fields (or other single, named fields).\"\"\"\n",
    "    return row.get(text_field)\n",
    "\n",
    "\n",
    "def prepare_openorca(row, **kwargs):\n",
    "    \"\"\"Example using the OpenOrca fields.\"\"\"\n",
    "    dbl_nl = \"\\n\\n\"\n",
    "    return (\n",
    "        f\"{row['system_prompt']+dbl_nl if row['system_prompt']!='' else ''}{row['question']}{dbl_nl}{row['response']}\"\n",
    "    )\n",
    "\n",
    "\n",
    "def prepare_ultrachat(row, **kwargs):\n",
    "    \"\"\"Example for UltraChat format.\"\"\"\n",
    "    sgl_nl = \"\\n\"\n",
    "    dbl_nl = \"\\n\\n\"\n",
    "    dialog_turns = [f\"{x['role'].capitalize()}:{sgl_nl}{x['content']}\" for x in row[\"messages\"]]\n",
    "    return dialog_turns\n",
    "\n",
    "\n",
    "def prepare_alpaca_train(row, **kwargs):\n",
    "    \"\"\"Example for Alpaca training format.\"\"\"\n",
    "    if row.get(\"input\") not in [\"\", None]:\n",
    "        return f\"\"\"\\\n",
    "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
    "\n",
    "### Instruction:\n",
    "{row['instruction']}\n",
    "\n",
    "### Input:\n",
    "{row['input']}\n",
    "\n",
    "### Response:\n",
    "{row['output']}\\\n",
    "\"\"\"\n",
    "    else:\n",
    "        return f\"\"\"\\\n",
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
    "\n",
    "### Instruction:\n",
    "{row['instruction']}\n",
    "\n",
    "### Response:\n",
    "{row['output']}\\\n",
    "\"\"\"\n",
    "\n",
    "def prepare_alpaca_eval(row, **kwargs):\n",
    "    \"\"\"Example for Alpaca eval format.\"\"\"\n",
    "    return f\"\"\"\\\n",
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
    "\n",
    "### Instruction:\n",
    "{row['instruction']}\n",
    "\n",
    "### Response:\n",
    "{row['output']}\\\n",
    "\"\"\"\n",
    "\n",
    "def prepare_alpaca_eval_chat(row, **kwargs):\n",
    "    \"\"\"Example for Alpaca eval in chat/messages format.\"\"\"\n",
    "    tokenizer = kwargs.get(\"tokenizer\", None)\n",
    "\n",
    "    return \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# basic settings\n",
    "BASE_SAVE_DIR = \"/XXXX-30/XXXX-29/XXXX-31/scratch/XXXX-1/llm-pretraining-root/output/data\"\n",
    "NUM_PROC=64\n",
    "\n",
    "# Hub settings\n",
    "ORG=\"XXXX-6\"\n",
    "TYPE=\"dataset\"\n",
    "PRIVATE=True\n",
    "TOKEN=\"<TOKEN>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_name_or_path = \"yahma/alpaca-cleaned\"\n",
    "ds_config=None\n",
    "split=\"train\"\n",
    "ds = load_dataset(ds_name_or_path, ds_config, split=split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['instruction', 'input', 'output'],\n",
       "    num_rows: 51760\n",
       "})"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map (num_proc=64): 100%|██████████| 51760/51760 [00:00<00:00, 208297.64 examples/s]\n"
     ]
    }
   ],
   "source": [
    "prepared_ds = ds.map(lambda x: {\"text\":prepare_alpaca_train(x)}, remove_columns=ds.column_names, num_proc=NUM_PROC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text'],\n",
       "    num_rows: 51760\n",
       "})"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prepared_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
      "\n",
      "### Instruction:\n",
      "Give three tips for staying healthy.\n",
      "\n",
      "### Response:\n",
      "1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n",
      "\n",
      "2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n",
      "\n",
      "3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\n"
     ]
    }
   ],
   "source": [
    "print(prepared_ds[0][\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving the dataset (64/64 shards): 100%|██████████| 51760/51760 [00:00<00:00, 107018.23 examples/s]\n"
     ]
    }
   ],
   "source": [
    "prepared_ds.save_to_disk(f\"{BASE_SAVE_DIR}/{ds_name_or_path.replace('/', '_').replace('-', '_')}\", num_proc=NUM_PROC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/XXXX-30/XXXX-29/XXXX-31/scratch/XXXX-1/frontier_conda_simpleton/lib/python3.11/site-packages/datasets/load.py:1461: FutureWarning: The repository for tatsu-lab/alpaca_eval contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/tatsu-lab/alpaca_eval\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "ds_name_or_path = \"tatsu-lab/alpaca_eval\"\n",
    "ds_config=None\n",
    "split=\"eval\"\n",
    "ds = load_dataset(ds_name_or_path, ds_config, split=split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['instruction', 'output', 'generator', 'dataset'],\n",
       "    num_rows: 805\n",
       "})"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map (num_proc=64): 100%|██████████| 805/805 [00:00<00:00, 3704.42 examples/s]\n"
     ]
    }
   ],
   "source": [
    "prepared_ds = ds.map(lambda x: {\"text\":prepare_alpaca_eval(x)}, remove_columns=ds.column_names, num_proc=NUM_PROC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text'],\n",
       "    num_rows: 805\n",
       "})"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prepared_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
      "\n",
      "### Instruction:\n",
      "What are the names of some famous actors that started their careers on Broadway?\n",
      "\n",
      "### Response:\n",
      "Some famous actors that started their careers on Broadway include: \n",
      "1. Hugh Jackman \n",
      "2. Meryl Streep \n",
      "3. Denzel Washington \n",
      "4. Julia Roberts \n",
      "5. Christopher Walken \n",
      "6. Anthony Rapp \n",
      "7. Audra McDonald \n",
      "8. Nathan Lane \n",
      "9. Sarah Jessica Parker \n",
      "10. Lin-Manuel Miranda\n"
     ]
    }
   ],
   "source": [
    "print(prepared_ds[0][\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving the dataset (64/64 shards): 100%|██████████| 805/805 [00:00<00:00, 1791.60 examples/s]\n"
     ]
    }
   ],
   "source": [
    "prepared_ds.save_to_disk(f\"{BASE_SAVE_DIR}/{ds_name_or_path.replace('/', '_').replace('-', '_')}\", num_proc=NUM_PROC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading readme: 100%|██████████| 387/387 [00:00<00:00, 4.05MB/s]\n",
      "Repo card metadata block was not found. Setting CardData to empty.\n",
      "Downloading data: 100%|██████████| 99.6M/99.6M [00:00<00:00, 106MB/s] \n",
      "Generating train split: 54974 examples [00:00, 93291.08 examples/s]\n"
     ]
    }
   ],
   "source": [
    "ds_name_or_path = \"cognitivecomputations/WizardLM_alpaca_evol_instruct_70k_unfiltered\"\n",
    "ds_config=None\n",
    "split=\"train\"\n",
    "ds = load_dataset(ds_name_or_path, ds_config, split=split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['instruction', 'output'],\n",
       "    num_rows: 54974\n",
       "})"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map (num_proc=64): 100%|██████████| 54974/54974 [00:00<00:00, 220623.36 examples/s]\n"
     ]
    }
   ],
   "source": [
    "prepared_ds = ds.map(lambda x: {\"text\":prepare_alpaca_train(x)}, remove_columns=ds.column_names, num_proc=NUM_PROC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text'],\n",
       "    num_rows: 54974\n",
       "})"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prepared_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
      "\n",
      "### Instruction:\n",
      "Can you provide a list of healthy habits to maintain a healthy lifestyle? Please format your response as an HTML page with bullet points.\n",
      "<html>\n",
      "  <body>\n",
      "    <h3>Healthy Habits:</h3>\n",
      "    <ul>\n",
      "      <li>Eating a balanced diet with plenty of fruits and vegetables.</li>\n",
      "      <li>Engaging in regular physical activity, such as walking, running, or cycling.</li>\n",
      "      <li>Getting enough sleep each night, ideally 7-8 hours.</li>\n",
      "      <li>Staying hydrated by drinking plenty of water throughout the day.</li>\n",
      "      <li>Limiting alcohol consumption and avoiding smoking.</li>\n",
      "      <li>Managing stress through relaxation techniques like meditation or yoga.</li>\n",
      "      <li>Regularly visiting a healthcare provider for check-ups and preventative care.</li>\n",
      "    </ul>\n",
      "  </body>\n",
      "</html>\n",
      "\n",
      "### Response:\n",
      "Here's an HTML page with bullet points for healthy habits:\n",
      "<html>\n",
      "  <body>\n",
      "    <h3>Healthy Habits:</h3>\n",
      "    <ul>\n",
      "      <li>Eating a balanced diet with plenty of fruits and vegetables.</li>\n",
      "      <li>Engaging in regular physical activity, such as walking, running, or cycling.</li>\n",
      "      <li>Getting enough sleep each night, ideally 7-8 hours.</li>\n",
      "      <li>Staying hydrated by drinking plenty of water throughout the day.</li>\n",
      "      <li>Limiting alcohol consumption and avoiding smoking.</li>\n",
      "      <li>Managing stress through relaxation techniques like meditation or yoga.</li>\n",
      "      <li>Regularly visiting a healthcare provider for check-ups and preventative care.</li>\n",
      "    </ul>\n",
      "  </body>\n",
      "</html>\n"
     ]
    }
   ],
   "source": [
    "print(prepared_ds[0][\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving the dataset (64/64 shards): 100%|██████████| 54974/54974 [00:00<00:00, 115751.90 examples/s]\n"
     ]
    }
   ],
   "source": [
    "prepared_ds.save_to_disk(f\"{BASE_SAVE_DIR}/{ds_name_or_path.replace('/', '_').replace('-', '_')}\", num_proc=NUM_PROC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading readme: 100%|██████████| 1.76k/1.76k [00:00<00:00, 8.43MB/s]\n",
      "Downloading data: 100%|██████████| 216M/216M [00:04<00:00, 44.7MB/s] \n",
      "Generating train split: 100%|██████████| 297097/297097 [00:01<00:00, 227886.56 examples/s]\n"
     ]
    }
   ],
   "source": [
    "ds_name_or_path = \"candenizkocak/code-alpaca-297k\"\n",
    "ds_config=None\n",
    "split=\"train\"\n",
    "ds = load_dataset(ds_name_or_path, ds_config, split=split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['instruction', 'input', 'output', 'source'],\n",
       "    num_rows: 297097\n",
       "})"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map (num_proc=64): 100%|██████████| 297097/297097 [00:01<00:00, 237589.55 examples/s]\n"
     ]
    }
   ],
   "source": [
    "prepared_ds = ds.map(lambda x: {\"text\":prepare_alpaca_train(x)}, remove_columns=ds.column_names, num_proc=NUM_PROC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text'],\n",
       "    num_rows: 297097\n",
       "})"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prepared_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
      "\n",
      "### Instruction:\n",
      "Create a nested loop to print every combination of numbers between 0-9, excluding any combination that contains the number 5. Additionally, exclude any combination that contains a repeating digit. Implement the solution without using any built-in functions or libraries to check for repeating digits.\n",
      "\n",
      "### Response:\n",
      "Here is an example of a nested loop in Python to print every combination of numbers between 0-9, excluding any combination that contains the number 5 or repeating digits:\n",
      "\n",
      "```python\n",
      "for i in range(10):  # First digit\n",
      "    for j in range(10):  # Second digit\n",
      "        for k in range(10):  # Third digit\n",
      "            # Checking for the conditions\n",
      "            if i != 5 and j != 5 and k != 5 and i != j and i != k and j != k:\n",
      "                print(i, j, k)\n",
      "```\n",
      "\n",
      "This code will generate and print every combination of three digits between 0-9 that do not contain the number 5 and do not have any repeating digits.\n"
     ]
    }
   ],
   "source": [
    "print(prepared_ds[0][\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving the dataset (0/64 shards):   0%|          | 0/297097 [00:00<?, ? examples/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving the dataset (64/64 shards): 100%|██████████| 297097/297097 [00:00<00:00, 604556.62 examples/s]\n"
     ]
    }
   ],
   "source": [
    "prepared_ds.save_to_disk(f\"{BASE_SAVE_DIR}/{ds_name_or_path.replace('/', '_').replace('-', '_')}\", num_proc=NUM_PROC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
