{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
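    "Takes the Kaggle dataset [leetcode-solutions](https://www.kaggle.com/datasets/erichartford/leetcode-solutions)\n",
    "and turns each solution into a basic dialogue using a preset list of user prompt templates.\n",
    "It also converts the submissions from the [LeetCodeContestsDataset](https://github.com/Nan-Do/LeetCodeContestsDataset)\n",
    "into the same INSTRUCTION / RESPONSE / SOURCE format."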
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# User-prompt templates for single-turn instructions.\n",
    "# `${lang}` and `${content}` are literal placeholders; the cell that builds `oa_leet10k`\n",
    "# picks one template at random per example and fills both in with `str.replace`.\n",
    "ONE_STEP_TEMPLATES = [\n",
    "    \"Can you write a program in ${lang} where\\n${content}\",\n",
    "    \"How would you implement a function in ${lang} that\\n${content}\",\n",
    "    \"Write a ${lang} function for\\n${content}\",\n",
    "    \"Can you create a ${lang} program that\\n${content}\",\n",
    "    \"Implement a function in ${lang} to\\n${content}\",\n",
    "    \"Write a ${lang} script for\\n${content}\",\n",
    "    \"How would you code a program in ${lang} to\\n${content}\",\n",
    "    \"Create a ${lang} function for\\n${content}\",\n",
    "    \"Write a ${lang} program that can\\n${content}\",\n",
    "    \"Can you implement a function in ${lang} that\\n${content}\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import kaggle\n",
    "import pandas as pd\n",
    "import random\n",
    "from IPython.display import display\n",
    "from datasets import Dataset\n",
    "import requests\n",
    "\n",
    "# Provenance URL stored in the SOURCE column of the Kaggle-derived examples.\n",
    "data_source = \"https://www.kaggle.com/datasets/erichartford/leetcode-solutions\"\n",
    "# Raw JSON file of LeetCode contest submissions; fetched with `requests` in the download cell.\n",
    "lc_contests_data_source = \"https://github.com/Nan-Do/LeetCodeContestsDataset/raw/main/submissions.json\"\n",
    "\n",
    "# Directory for the downloaded raw files; created up front (no error if it already exists).\n",
    "output_dir = \"data\"\n",
    "os.makedirs(output_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download both raw inputs into `output_dir`.\n",
    "kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", output_dir, unzip=True)\n",
    "\n",
    "# Stream the contests file to disk instead of buffering the whole body in memory, and\n",
    "# fail loudly on an HTTP error so an error page is never saved as lc_contests.json.\n",
    "# The timeout keeps a stalled connection from hanging the notebook indefinitely.\n",
    "lc_contests_path = os.path.join(output_dir, \"lc_contests.json\")\n",
    "with requests.get(lc_contests_data_source, allow_redirects=True, stream=True, timeout=60) as r:\n",
    "    r.raise_for_status()\n",
    "    with open(lc_contests_path, \"wb\") as f:\n",
    "        for chunk in r.iter_content(chunk_size=1024):\n",
    "            if chunk:  # skip keep-alive chunks\n",
    "                f.write(chunk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Languages looked up in each row's `answer` mapping.\n",
    "LEETCODE_LANGS = [\"c++\", \"java\", \"javascript\", \"python\"]\n",
    "# Fixed seed so the template chosen for each example is the same on every Restart & Run All.\n",
    "TEMPLATE_SEED = 42\n",
    "\n",
    "leetcode_solutions = pd.read_json(os.path.join(output_dir, \"leetcode-solutions.jsonl\"), lines=True)\n",
    "leetcode_contests = pd.read_json(os.path.join(output_dir, \"lc_contests.json\"))\n",
    "\n",
    "# Build records with the columns INSTRUCTION, RESPONSE, SOURCE:\n",
    "#   INSTRUCTION: a random choice from ONE_STEP_TEMPLATES with the language and content filled in\n",
    "#   RESPONSE:    the answer to the question being posed, in that language\n",
    "#   SOURCE:      the URL of the dataset the example came from\n",
    "template_rng = random.Random(TEMPLATE_SEED)  # local RNG; leaves the global `random` state untouched\n",
    "leet10k_records = []\n",
    "for _, row in leetcode_solutions.iterrows():\n",
    "    content = row[\"content\"]\n",
    "    answers = row[\"answer\"]\n",
    "    for lang in LEETCODE_LANGS:\n",
    "        if lang in answers:\n",
    "            instruction = (\n",
    "                template_rng.choice(ONE_STEP_TEMPLATES)\n",
    "                .replace(\"${lang}\", lang)\n",
    "                .replace(\"${content}\", content)\n",
    "            )\n",
    "            leet10k_records.append(\n",
    "                {\n",
    "                    \"INSTRUCTION\": instruction,\n",
    "                    \"RESPONSE\": answers[lang],\n",
    "                    \"SOURCE\": data_source,\n",
    "                }\n",
    "            )\n",
    "\n",
    "# The contests data already carries instruction/input/output fields; only reshape it.\n",
    "contest_records = [\n",
    "    {\n",
    "        \"INSTRUCTION\": row[\"instruction\"] + \"\\n\" + row[\"input\"],\n",
    "        \"RESPONSE\": row[\"output\"],\n",
    "        \"SOURCE\": \"https://github.com/Nan-Do/LeetCodeContestsDataset\",\n",
    "    }\n",
    "    for _, row in leetcode_contests.iterrows()\n",
    "]\n",
    "\n",
    "# Separate names for the record lists and the frames, so `oa_*` is always a DataFrame.\n",
    "oa_leet10k = pd.DataFrame(leet10k_records)\n",
    "oa_leetcode_contests = pd.DataFrame(contest_records)\n",
    "\n",
    "print(f\"oa_leet10k: {oa_leet10k.shape[0]}, oa_leetcode_contests: {oa_leetcode_contests.shape[0]}\")\n",
    "\n",
    "\n",
    "def _preview(df, n=5):\n",
    "    \"\"\"Return a Styler for the first `n` rows of `df`, left-aligned with newlines preserved.\"\"\"\n",
    "    return df.head(n).style.set_properties(\n",
    "        **{\n",
    "            \"text-align\": \"left\",\n",
    "            \"white-space\": \"pre-wrap\",\n",
    "        }\n",
    "    )\n",
    "\n",
    "\n",
    "# Show the first 5 rows of each frame with newline characters rendered in the text columns.\n",
    "with pd.option_context(\"display.max_colwidth\", 80):\n",
    "    display(_preview(oa_leet10k), _preview(oa_leetcode_contests))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write each frame to Parquet and load it back as a Hugging Face `Dataset`.\n",
    "oa_leet10k.to_parquet(\"oa_leet10k.parquet\", row_group_size=100, engine=\"pyarrow\")\n",
    "ds_leet10k = Dataset.from_parquet(\"oa_leet10k.parquet\")\n",
    "oa_leetcode_contests.to_parquet(\"oa_leetcode_contests.parquet\", row_group_size=100, engine=\"pyarrow\")\n",
    "ds_leetcode_contests = Dataset.from_parquet(\"oa_leetcode_contests.parquet\")\n",
    "\n",
    "# Uploading is opt-in: set PUSH_TO_HUB = True to push both datasets to the Hugging Face Hub.\n",
    "# The earlier commented-out instructions pointed both pushes at \"ehartford/oa_leet10k\";\n",
    "# each dataset now targets its own repo so the two cannot land in the same place.\n",
    "PUSH_TO_HUB = False\n",
    "if PUSH_TO_HUB:\n",
    "    ds_leet10k.push_to_hub(\"ehartford/oa_leet10k\")\n",
    "    # NOTE(review): repo id for the contests dataset is assumed - confirm before enabling.\n",
    "    ds_leetcode_contests.push_to_hub(\"ehartford/oa_leetcode_contests\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "281f1c8753b18c9d2968280632816a025c721e632f5f355c2f6dfab2614fba3c"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
