{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/lidong1/miniconda3/envs/llama_factory/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['dataset', 'id', 'messages'],\n",
       "        num_rows: 326154\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "dataset = load_dataset(\"allenai/tulu-v2-sft-mixture\")\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# judge the prompt or response is empty\n",
    "def flag_empty(e):\n",
    "    e[\"empty\"] = False\n",
    "    for m in e[\"messages\"]:\n",
    "        if m[\"content\"].strip() == \"\" and m[\"role\"] != \"system\":\n",
    "            e[\"empty\"] = True\n",
    "    return e\n",
    "\n",
    "_dataset = dataset.map(flag_empty, num_proc=8)\n",
    "_empty_ds = _dataset.filter(lambda x: x[\"empty\"], num_proc=8)\n",
    "_dataset_notempty = _dataset.filter(lambda x: not x[\"empty\"], num_proc=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['dataset', 'id', 'messages', 'empty'],\n",
       "        num_rows: 270\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "_empty_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # export to jsonl\n",
    "# export_dataset = _dataset_notempty.select_columns([\"dataset\", \"id\", \"messages\"])\n",
    "# export_dataset[\"train\"].to_json(\"./tulu_v2.json\",num_proc=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "export data length 325884\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "export_dataset = _dataset_notempty.select_columns([\"dataset\", \"id\", \"messages\"])\n",
    "export_data_list = [dict(row) for row in export_dataset['train']]\n",
    "print(f\"export data length {len(export_data_list)}\")\n",
    "with open('../data/tulu_v2.json', 'w', encoding='utf-8') as f:  \n",
    "    json.dump(export_data_list, f, ensure_ascii=False, indent=4)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'dataset': 'flan_v2',\n",
       "  'id': 'flan_v2_0',\n",
       "  'messages': [{'role': 'user',\n",
       "    'content': 'Question: Gdańsk (, ; German: \"\" , ) is a Polish city on the Baltic coast. It is the capital of the Pomeranian Voivodeship, Poland\\'s principal seaport and is also the centre of the country\\'s fourth-largest metropolitan area. \\n\\nThe city lies on the southern edge of Gdańsk Bay (of the Baltic Sea), in a conurbation with the city of Gdynia, spa town of Sopot, and suburban communities, which together form a metropolitan area called the Tricity (\"Trójmiasto\"), with a population approaching 1.4 million. Gdańsk itself has a population of 460,427 (December 2012), making it the largest city in the Pomerania region of Northern Poland. \\n\\nGdańsk is the capital of Gdańsk Pomerania and the largest city of Kashubia. With its origins as a Polish stronghold erected in the 980s by Mieszko I of Poland, the city\\'s history is complex, with periods of Polish rule, periods of Prussian or German rule, and periods of autonomy or self-rule as a \"free city\". Between the world wars, the Free City of Danzig was in a customs union with Poland and was located between German East Prussia and the so-called Polish Corridor. \\n\\nGdańsk lies at the mouth of the Motława River, connected to the Leniwka, a branch in the delta of the nearby Vistula River, which drains 60 percent of Poland and connects Gdańsk with the Polish capital, Warsaw. Together with the nearby port of Gdynia, Gdańsk is also an important industrial center. In the late Middle Ages it was an important seaport and shipbuilding town, and in the 14th and 15th centuries a member of the Hanseatic League. 1. Which bay is Gdansk on?\\n2. Which sea is that bay part of?\\n3. Is Gdansk a capital?\\n4. Of what?\\n5. Who was the first ruler?\\n6. Name one of the countries that has ruled it?\\n7. And another?\\n8. And one more?\\n9. Has it ever been under self-rule?\\n10. What was it called between the world wars?\\n11. Is it near a river?\\n12. Which one?\\n13. Which is connected to ? Provide a numbered list of answers.\\n****\\nA numbered of answers: 1. Gdańsk Bay\\n2. the Baltic Sea\\n3. Yes\\n4. Pomeranian Voivodeship\\n5. Mieszko I\\n6. Poland\\n7. German\\n8. Prussian\\n9. Yes\\n10. Danzig\\n11. Yes\\n12. the Motława River\\n13. the Leniwka\\nQ: Ferguson, Missouri (CNN) -- As the St. Louis suburb of Ferguson calmed Friday after nights of protests over the fatal shooting of an unarmed black teen, the question remains: Where\\'s the police officer who pulled the trigger? \\n\\nOfficer Darren Wilson, 28, shot Michael Brown on August 9. The shooting sparked days of violent protests in Ferguson as residents demanded his arrest. \\n\\nSeparate federal and local investigations are under way, and Wilson -- who has received death threats -- has disappeared from public view. \\n\\nGovernor orders drawdown of National Guard in Ferguson \\n\\nHere\\'s what is known about his whereabouts. \\n\\nWhere is he now? \\n\\nFew outside Wilson\\'s family and authorities know for sure. \\n\\nWilson owns a house in a modest neighborhood about 20 miles from Ferguson. He bought the house shortly after he was divorced last year, according to the St. Louis Post-Dispatch. \\n\\nBut several neighbors have told CNN that Wilson left home before his name was released last week. \\n\\nWhat are his neighbors saying about his whereabouts? \\n\\nNot much. Most have shunned reporters\\' requests for interviews, and some put signs in their yards shooing away journalists. \\n\\n\"We don\\'t know anything ... Pray for Peace,\" one read, according to the Post-Dispatch. \\n\\n\"We have 2 children. Do not knock!! No comment,\" another family wrote. \\n\\nAny trails on social media? \\n\\nThe newspaper reported that Wilson deactivated his social media accounts before his name went public. \\n\\nThe only social media presence for him now is from supporters, who have set up Facebook pages to support and raise money for him. \\n\\n1. unarmed black teen\\n2. Darren Wilson shot Michael Brown\\n3. Yes\\n4. Few outside Wilson\\'s family and authorities know for sure.\\n5. about 20 miles from Ferguson\\n6. No\\n7. Most have shunned reporters\\' requests for interviews, and some put signs in their yards shooing away journalists\\n8. federal and local investigations\\n9. 28\\n10. unknown\\n11. The shooting sparked days of violent protests\\n12. his arrest\\n13. Most have not\\n14. that Wilson left home\\n15. shortly after he was divorced\\n16. unknown\\n17. Yes\\n18. We have 2 children. Do not knock\\n19. no\\n20. Ferguson\\n\\nNumbered questions:\\n1. Who was Michael Brown?\\n2. What happened to him?\\n3. Was Darren Wilson a cop?\\n4. Where is Wilson now?\\n5. Where does he live?\\n6. Do his neighbors know where he is?\\n7. How have they handled the publicity?\\n8. What kind of ongoing investigations are there?\\n9. How old is Wilson?\\n10. Why did he shoot Michael Brown?\\n11. How did local people react to this?\\n12. What were they hoping to accomplish with the protests?\\n13. Have any of his neighbors spoken to press?\\n14. What did they tell CNN?\\n15. When did Wilson buy the house?\\n16. When was that?\\n17. Did he receive death threats?\\n18. What do the signs in the neighbors yards say?\\n19. Is Wilson active online?\\n20. What  town was Brown shot in?\\ninput: (CNN) -- At least one performer fell hard for Sunday night\\'s Billboard Music Awards. \\n\\nNot sure what that means? Well, check out the top five moments from Sunday night\\'s 2013 Billboard Music Awards: \\n\\n1. Miguel lands on a fan \\n\\nThe R & B singer accidentally landed on a woman in the mosh pit during a performance of his hit song \"Adorn.\" He was attempting a jump that went wrong. The fan appeared to be fine and the singer kept singing. Miguel later tweeted: \"got caught up in the moment, thank goodness Khyati is okay.\" \\n\\n2. Taylor Swift wins eight out of the 11 awards she was up for \\n\\nSwift is no stranger to taking to the stage to accept accolades, and on Sunday night she collected a few, including Billboard Artist of the Year. \\n\\n\"My album is kind of on the ends of the intense emotional spectrum,\" Swift said while accepting that award. \"You (fans) are the longest and best relationship I have ever had.\" \\n\\nShe also won Top Country Artist,Top Billboard 200 Artist, Top Female Artist, and Top Digital Songs Artist -- the last one a tie with singer Carly Rae Jepsen. Swift\\'s album \"Red\" won in the Top Billboard 200 and Country Album categories and her single \"We Are Never Ever Getting Back Together\" collected the trophy for Top Country Song. \\n\\n3. Justin Bieber gets booed \\n\\nWhile accepting the first ever Milestone Award, the Biebs was both cheered and jeered. He appeared to reference the rough times he has had of late in his acceptance speech.  1. Who was the one performer who fell hard?\\n2. was the fan ok?\\n3. how many awards Taylor Swift won?\\n4. Why did Justin get booed?\\n5. Who was Carly Rae Jepsen tied with ?\\n6. what song got top country song?\\n7. Was the fan female or male that Miguel fell on?\\n8. What did bieber talk about in his acceptance speech?\\n9. what swift\\'s ablum won the top billboard 200?\\n10. Was \"Adorn\" a hit song? Return numbered answers in your output.\\n\\noutput:'},\n",
       "   {'role': 'assistant',\n",
       "    'content': '1. Miguel\\n2. Yes\\n3. Eight\\n4. unknown\\n5. Swift\\n6. \"We Are Never Ever Getting Back Together\"\\n7. Female\\n8. The rough times he has had of late.\\n9. \"Red\"\\n10. Yes'}]},\n",
       " {'dataset': 'flan_v2',\n",
       "  'id': 'flan_v2_1',\n",
       "  'messages': [{'role': 'user',\n",
       "    'content': 'Q: A.S. Roma, chairman, James Pallotta; James Pallotta, birthPlace, Boston\\nA: James Pallotta, who was born in Boston, is the chairman of A.S. Roma.\\nQ: Egg Harbor Township, New Jersey, isPartOf, Atlantic County, New Jersey\\nA: Egg Harbor Township is a township in Atlantic County, New Jersey, United States.\\nQ: Hypermarcas, type, S.A. (corporation); Hypermarcas, numberOfEmployees, 10252\\nA: Hypermarcas is an S.A. corporation which employs 10,252 people.\\nQ: Singapore, language, English language; Ayam penyet, region, Singapore\\nA:'},\n",
       "   {'role': 'assistant',\n",
       "    'content': 'English is one of the languages spoken in Singapore where ayam penyet comes from.'}]}]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "with open(\"../data/tulu_v2.json\", \"r\") as f:\n",
    "    data = json.load(f)\n",
    "data[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "325884"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'assistant', 'system', 'user'}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# role type\n",
    "def get_role(e):\n",
    "    e['role'] = [x['role'] for x in e['messages']]\n",
    "    return e\n",
    "data_role = dataset.map(get_role, num_proc=8)\n",
    "role_list = data_role['train']['role']\n",
    "roles = [item for sublist in role_list for item in sublist]\n",
    "set(roles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'lima', 'oasst1', 'sharegpt'}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dataset['train'][0]['messages'])\n",
    "def data_length(e):\n",
    "    e['length'] = len(e[\"messages\"])\n",
    "    return e\n",
    "dataset_length = dataset.map(data_length, num_proc=8)\n",
    "dataset_length\n",
    "# mutil turns dataset\n",
    "dataset_mutil_turn = dataset_length.filter(lambda x: x['length'] > 3, num_proc=8)\n",
    "dataset_mutil_turn\n",
    "set(dataset_mutil_turn['train']['dataset'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "#dialogue turns count\n",
    "from collections import Counter  \n",
    "data_length = dataset_length['train']['length']\n",
    "counter = Counter(data_length) \n",
    "# for value, count in counter.items():  \n",
    "#     print(f\"Value: {value}, Count: {count}\")  \n",
    "import pandas as pd\n",
    "data_length_distribution = pd.Series(counter)\n",
    "data_length_distribution.sort_index(inplace=True)\n",
    "data_length_distribution.to_csv(\"./turns_distribution.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama_factory",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
