{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_ultra_data(file_path):\n",
    "    with open(file_path, 'r', encoding=\"utf-8\") as file:\n",
    "        session_list = file.readlines()\n",
    "    return session_list\n",
    "d = read_ultra_data(r'\\corpus\\dolly\\databricks-dolly-15k.jsonl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "d_lst = []\n",
    "for session_str in d:\n",
    "    d_lst.append(eval(session_str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15011"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(d_lst)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def stratified_sampling(data, key, sample_size, random_seed):\n",
    "    random.seed(random_seed)  \n",
    "    \n",
    "    count_dict = {}\n",
    "    for item in data:\n",
    "        value = item[key]\n",
    "        count_dict[value] = count_dict.get(value, 0) + 1\n",
    "\n",
    "    total_samples = len(data)\n",
    "    proportions = {k: v / total_samples for k, v in count_dict.items()}\n",
    "    \n",
    "    sample_sizes = {k: round(v * sample_size) for k, v in proportions.items()}\n",
    "    \n",
    "    sampled_data = []\n",
    "    for value, sample_size in sample_sizes.items():\n",
    "        group = [item for item in data if item[key] == value]\n",
    "        if len(group) < sample_size:\n",
    "            sampled_data.extend(group)\n",
    "        elif len(group) == sample_size:\n",
    "            sampled_data.extend(group)\n",
    "        else:\n",
    "            sampled_data.extend(random.sample(group, sample_size))\n",
    "    \n",
    "    return sampled_data\n",
    "\n",
    "sampled_data = stratified_sampling(d_lst, 'category', 10000, 125)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "category\n",
       "open_qa                   2493\n",
       "general_qa                1460\n",
       "classification            1423\n",
       "closed_qa                 1181\n",
       "brainstorming             1176\n",
       "information_extraction    1003\n",
       "summarization              791\n",
       "creative_writing           472\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_df = pd.DataFrame(sampled_data)\n",
    "sample_df['category'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'instruction': 'Given these paragraphs about the founding of Tesla, who else part from Elon Musk and J. B. Straubel are co-founders of Tesla?',\n",
       " 'context': 'The company was incorporated as Tesla Motors, Inc. on July 1, 2003, by Martin Eberhard and Marc Tarpenning. Eberhard and Tarpenning served as CEO and CFO, respectively. Eberhard said he wanted to build \"a car manufacturer that is also a technology company\", with its core technologies as \"the battery, the computer software, and the proprietary motor\".\\n\\nIan Wright was Tesla\\'s third employee, joining a few months later. In February 2004, the company raised $7.5 million in series A funding, including $6.5 million from Elon Musk, who had received $100 million from the sale of his interest in PayPal two years earlier. Musk became the chairman of the board of directors and the largest shareholder of Tesla. J. B. Straubel joined Tesla in May 2004 as chief technical officer.\\n\\nA lawsuit settlement agreed to by Eberhard and Tesla in September 2009 allows all five – Eberhard, Tarpenning, Wright, Musk, and Straubel – to call themselves co-founders.',\n",
       " 'response': 'Eberhard, Tarpenning and Wright',\n",
       " 'category': 'closed_qa'}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check\n",
    "sampled_data[0]\n",
    "# 'instruction': 'Given these paragraphs about the founding of Tesla, who else part from Elon Musk and J. B. Straubel are co-founders of Tesla?',\n",
    "#  'context': 'The company was inc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9999"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sampled_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Supplement one session\n",
    "for session_dict in d_lst:\n",
    "    if session_dict['category'] == 'open_qa':\n",
    "        if str(session_dict) in str(sampled_data):\n",
    "            continue\n",
    "        else:\n",
    "            sampled_data.append(session_dict)\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10000"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sampled_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "category\n",
       "open_qa                   2494\n",
       "general_qa                1460\n",
       "classification            1423\n",
       "closed_qa                 1181\n",
       "brainstorming             1176\n",
       "information_extraction    1003\n",
       "summarization              791\n",
       "creative_writing           472\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_df = pd.DataFrame(sampled_data)\n",
    "sample_df['category'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Post-processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'instruction': 'Given these paragraphs about the founding of Tesla, who else part from Elon Musk and J. B. Straubel are co-founders of Tesla?',\n",
       " 'context': 'The company was incorporated as Tesla Motors, Inc. on July 1, 2003, by Martin Eberhard and Marc Tarpenning. Eberhard and Tarpenning served as CEO and CFO, respectively. Eberhard said he wanted to build \"a car manufacturer that is also a technology company\", with its core technologies as \"the battery, the computer software, and the proprietary motor\".\\n\\nIan Wright was Tesla\\'s third employee, joining a few months later. In February 2004, the company raised $7.5 million in series A funding, including $6.5 million from Elon Musk, who had received $100 million from the sale of his interest in PayPal two years earlier. Musk became the chairman of the board of directors and the largest shareholder of Tesla. J. B. Straubel joined Tesla in May 2004 as chief technical officer.\\n\\nA lawsuit settlement agreed to by Eberhard and Tesla in September 2009 allows all five – Eberhard, Tarpenning, Wright, Musk, and Straubel – to call themselves co-founders.',\n",
       " 'response': 'Eberhard, Tarpenning and Wright',\n",
       " 'category': 'closed_qa'}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sampled_data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_q_lst = []\n",
    "for i, session_dict in enumerate(sampled_data):\n",
    "    new_dict = dict()\n",
    "    new_dict['id'] = i\n",
    "    if session_dict['category'] == 'closed_qa':\n",
    "        q = 'This is a Closed Question-Answering task.\\n' + session_dict['instruction']  + '\\n' + session_dict['context']  \n",
    "    elif session_dict['category'] == 'summarization':\n",
    "        q = 'This is a Summarization task.\\n' + session_dict['instruction'] + '\\n' + session_dict['context']\n",
    "    elif session_dict['category'] == 'information_extraction':\n",
    "        q = 'This is a Information Extraction task.\\n' + session_dict['instruction'] + '\\n' + session_dict['context']\n",
    "    else:\n",
    "        q = session_dict['instruction']\n",
    "    new_dict['question'] = q\n",
    "    merged_q_lst.append(new_dict)\n",
    "assert len(sampled_data) == len(merged_q_lst)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': 0,\n",
       " 'question': 'This is a Closed Question-Answering task.\\nGiven these paragraphs about the founding of Tesla, who else part from Elon Musk and J. B. Straubel are co-founders of Tesla?\\nThe company was incorporated as Tesla Motors, Inc. on July 1, 2003, by Martin Eberhard and Marc Tarpenning. Eberhard and Tarpenning served as CEO and CFO, respectively. Eberhard said he wanted to build \"a car manufacturer that is also a technology company\", with its core technologies as \"the battery, the computer software, and the proprietary motor\".\\n\\nIan Wright was Tesla\\'s third employee, joining a few months later. In February 2004, the company raised $7.5 million in series A funding, including $6.5 million from Elon Musk, who had received $100 million from the sale of his interest in PayPal two years earlier. Musk became the chairman of the board of directors and the largest shareholder of Tesla. J. B. Straubel joined Tesla in May 2004 as chief technical officer.\\n\\nA lawsuit settlement agreed to by Eberhard and Tesla in September 2009 allows all five – Eberhard, Tarpenning, Wright, Musk, and Straubel – to call themselves co-founders.'}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_q_lst[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, session_dict in enumerate(merged_q_lst):\n",
    "    new_q = session_dict['question'].encode('UTF-8', 'ignore').decode('UTF-8')\n",
    "    merged_q_lst[i]['question'] = new_q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10000"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(merged_q_lst)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dolly_q_10k.json', \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(merged_q_lst, f, indent=2, ensure_ascii=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mt-bench",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
