{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5dbfc2fe",
   "metadata": {},
   "source": [
    "# chatbot_arena\n",
    "The dataset is based on `lmsys/chatbot_arena_conversations`, which contains 33K cleaned conversations with pairwise human preferences collected from 13K unique IP addresses on the Chatbot Arena from April to June 2023."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ead4ae2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "ds = load_dataset('GAIR/preference-dissection', split='train', token=\"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\")\n",
    "ds.to_json(\"../dataset/chatbot_arena.json\")\n",
    "\n",
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4084fedf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'analyzing_general',\n",
       " 'asking_how_to_question',\n",
       " 'brainstorming',\n",
       " 'chitchat',\n",
       " 'classification_identification',\n",
       " 'code_correction_rewriting',\n",
       " 'code_generation',\n",
       " 'code_simplification',\n",
       " 'code_to_code_translation',\n",
       " 'counterfactual',\n",
       " 'creative_writing',\n",
       " 'data_analysis',\n",
       " 'default',\n",
       " 'explaining_code',\n",
       " 'explaining_general',\n",
       " 'functional_writing',\n",
       " 'information_extraction',\n",
       " 'instructional_rewriting',\n",
       " 'keywords_extraction',\n",
       " 'language_polishing',\n",
       " 'math_reasoning',\n",
       " 'note_summarization',\n",
       " 'open_question',\n",
       " 'paraphrasing',\n",
       " 'planning',\n",
       " 'question_generation',\n",
       " 'ranking',\n",
       " 'reading_comprehension',\n",
       " 'recommendation',\n",
       " 'rejecting',\n",
       " 'roleplay',\n",
       " 'seeking_advice',\n",
       " 'solving_exam_question_with_math',\n",
       " 'solving_exam_question_without_math',\n",
       " 'text_correction',\n",
       " 'text_simplification',\n",
       " 'text_summarization',\n",
       " 'text_to_text_translation',\n",
       " 'title_generation',\n",
       " 'value_judgement',\n",
       " 'verifying_fact',\n",
       " 'writing_advertisement',\n",
       " 'writing_blog_post',\n",
       " 'writing_cooking_recipe',\n",
       " 'writing_email',\n",
       " 'writing_job_application',\n",
       " 'writing_legal_document',\n",
       " 'writing_marketing_materials',\n",
       " 'writing_news_article',\n",
       " 'writing_personal_essay',\n",
       " 'writing_presentation_script',\n",
       " 'writing_product_description',\n",
       " 'writing_scientific_paper',\n",
       " 'writing_social_media_post',\n",
       " 'writing_song_lyrics',\n",
       " 'writing_technical_document'}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(ds['scenario_auto-j'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0aab0664",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Advice',\n",
       " 'Code',\n",
       " 'Communication',\n",
       " 'Creative Writing',\n",
       " 'Daily Tasks',\n",
       " 'Exam Questions',\n",
       " 'Functional Writing',\n",
       " 'Knowledge-aware',\n",
       " 'NLP Tasks',\n",
       " 'Others',\n",
       " 'Unsafe Queries'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(ds['scenario_group'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "59420e1a",
   "metadata": {},
   "source": [
    "# PPE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9af84c0c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating test split: 100%|██████████| 16038/16038 [00:01<00:00, 14764.78 examples/s]\n",
      "Filter: 100%|██████████| 16038/16038 [00:01<00:00, 15985.79 examples/s]\n",
      "Creating json from Arrow format: 100%|██████████| 11/11 [00:01<00:00,  9.84ba/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "70885644"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = load_dataset('lmarena-ai/PPE-Human-Preference-V1', split='test')\n",
    "ds = ds.filter(lambda ex: ex[\"winner\"] in [\"model_a\", \"model_b\"])\n",
    "ds.to_json(\"../dataset/PPE_HF.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "492d9b65",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_id</th>\n",
       "      <th>model_a</th>\n",
       "      <th>model_b</th>\n",
       "      <th>winner</th>\n",
       "      <th>prompt</th>\n",
       "      <th>response_1</th>\n",
       "      <th>response_2</th>\n",
       "      <th>language</th>\n",
       "      <th>conv_metadata</th>\n",
       "      <th>is_code</th>\n",
       "      <th>...</th>\n",
       "      <th>category_tag</th>\n",
       "      <th>sim</th>\n",
       "      <th>length_a</th>\n",
       "      <th>length_b</th>\n",
       "      <th>token_diff</th>\n",
       "      <th>longer_won</th>\n",
       "      <th>hard_prompt</th>\n",
       "      <th>easy_prompt</th>\n",
       "      <th>if_prompt</th>\n",
       "      <th>math_prompt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>f4c6889138ae4867a686df2f35080a3a</td>\n",
       "      <td>mistral-large-2402</td>\n",
       "      <td>gpt-4o-2024-05-13</td>\n",
       "      <td>model_a</td>\n",
       "      <td>I'm a developer building a chrome extension. I...</td>\n",
       "      <td>From your description, it seems like you are t...</td>\n",
       "      <td>The issue seems to be related to how the `chro...</td>\n",
       "      <td>English</td>\n",
       "      <td>{'context_a_tokens': 191, 'context_b_tokens': ...</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>{'criteria_v0.1': {'complexity': True, 'creati...</td>\n",
       "      <td>0.827569</td>\n",
       "      <td>546</td>\n",
       "      <td>380</td>\n",
       "      <td>0.695971</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3c547616a9a0433481a2f32b06311a36</td>\n",
       "      <td>phi-3-medium-4k-instruct</td>\n",
       "      <td>llama-3-70b-instruct</td>\n",
       "      <td>model_b</td>\n",
       "      <td>Repeat the question and then answer it: Matteo...</td>\n",
       "      <td>Question: Matteo has 20 apples and buys 20 ora...</td>\n",
       "      <td>Here is the question:\\n\\nMatteo has 20 apples,...</td>\n",
       "      <td>English</td>\n",
       "      <td>{'context_a_tokens': 51, 'context_b_tokens': 5...</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>{'criteria_v0.1': {'complexity': True, 'creati...</td>\n",
       "      <td>0.958751</td>\n",
       "      <td>211</td>\n",
       "      <td>246</td>\n",
       "      <td>0.857724</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>efca77bd70ce41c8acb6e806f2509705</td>\n",
       "      <td>gpt-4-1106-preview</td>\n",
       "      <td>gpt-4o-2024-05-13</td>\n",
       "      <td>model_a</td>\n",
       "      <td>Your response must begin with 'なんJ 栗の花大学 生唾芸術C...</td>\n",
       "      <td>なんJ 栗の花大学 生唾芸術CreamPan部\\nたっぷりUnchartedな世界で満足して...</td>\n",
       "      <td>なんJ 栗の花大学 生唾芸術CreamPan部\\nたっぷりUnchartedな世界で満足して...</td>\n",
       "      <td>English</td>\n",
       "      <td>{'context_a_tokens': 665, 'context_b_tokens': ...</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>{'criteria_v0.1': {'complexity': False, 'creat...</td>\n",
       "      <td>0.841135</td>\n",
       "      <td>1077</td>\n",
       "      <td>964</td>\n",
       "      <td>0.895079</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ae8804ad877941a29412972c07b006ab</td>\n",
       "      <td>llama-3-8b-instruct</td>\n",
       "      <td>command-r</td>\n",
       "      <td>model_a</td>\n",
       "      <td>in angular app, i want to create a dropdown, b...</td>\n",
       "      <td>```\\n&lt;div&gt;\\n  &lt;select&gt;\\n    &lt;ng-container *ngF...</td>\n",
       "      <td>Here's an example of how you could create a dr...</td>\n",
       "      <td>English</td>\n",
       "      <td>{'context_a_tokens': 48, 'context_b_tokens': 4...</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>{'criteria_v0.1': {'complexity': True, 'creati...</td>\n",
       "      <td>0.723722</td>\n",
       "      <td>94</td>\n",
       "      <td>66</td>\n",
       "      <td>0.702128</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6cb357ed83d343c59820654649b534c7</td>\n",
       "      <td>gpt-4-1106-preview</td>\n",
       "      <td>llama-3-8b-instruct</td>\n",
       "      <td>model_b</td>\n",
       "      <td>Я хочу чтобы ты разработал стратегию анализа к...</td>\n",
       "      <td>Анализ криптовалютных проектов, которые не пок...</td>\n",
       "      <td>I'd be happy to help you with that!\\n\\nTo deve...</td>\n",
       "      <td>Russian</td>\n",
       "      <td>{'context_a_tokens': 104, 'context_b_tokens': ...</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>{'criteria_v0.1': {'complexity': True, 'creati...</td>\n",
       "      <td>0.693042</td>\n",
       "      <td>1045</td>\n",
       "      <td>1175</td>\n",
       "      <td>0.889362</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10211</th>\n",
       "      <td>77d972487d8649ec8f92f3cc87c97df1</td>\n",
       "      <td>command-r-plus</td>\n",
       "      <td>mistral-large-2402</td>\n",
       "      <td>model_a</td>\n",
       "      <td>create an Epic Rap Battles of History rap batt...</td>\n",
       "      <td>(Intro)\\n\\n[A courtroom setting with Judge Bob...</td>\n",
       "      <td>Verse 1 - Dr. House:\\nYo, it's Dr. House, the ...</td>\n",
       "      <td>English</td>\n",
       "      <td>{'context_a_tokens': 26, 'context_b_tokens': 2...</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>{'criteria_v0.1': {'complexity': False, 'creat...</td>\n",
       "      <td>0.874570</td>\n",
       "      <td>924</td>\n",
       "      <td>411</td>\n",
       "      <td>0.444805</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10212</th>\n",
       "      <td>5c67012d83cd4b97b3be292baa75bab6</td>\n",
       "      <td>gemini-1.5-pro-api-0514</td>\n",
       "      <td>llama-3-70b-instruct</td>\n",
       "      <td>model_a</td>\n",
       "      <td>how to create a weaviate schema of vectors rep...</td>\n",
       "      <td>```python\\nimport weaviate\\n\\nclient = weaviat...</td>\n",
       "      <td>In Weaviate, you can create a schema for vecto...</td>\n",
       "      <td>English</td>\n",
       "      <td>{'context_a_tokens': 17, 'context_b_tokens': 1...</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>{'criteria_v0.1': {'complexity': True, 'creati...</td>\n",
       "      <td>0.797442</td>\n",
       "      <td>644</td>\n",
       "      <td>417</td>\n",
       "      <td>0.647516</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10213</th>\n",
       "      <td>42ef375bbb8a4f59be13d623e42bc42d</td>\n",
       "      <td>yi-1.5-34b-chat</td>\n",
       "      <td>mistral-large-2402</td>\n",
       "      <td>model_b</td>\n",
       "      <td>if a had some popping candy and i put it in so...</td>\n",
       "      <td>Putting popping candy into someone else's mout...</td>\n",
       "      <td>If you put popping candy in someone else's mou...</td>\n",
       "      <td>English</td>\n",
       "      <td>{'context_a_tokens': 24, 'context_b_tokens': 2...</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>{'criteria_v0.1': {'complexity': False, 'creat...</td>\n",
       "      <td>0.742115</td>\n",
       "      <td>453</td>\n",
       "      <td>89</td>\n",
       "      <td>0.196468</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10214</th>\n",
       "      <td>87f2164cc0f0464e948c78d154074457</td>\n",
       "      <td>llama-3-8b-instruct</td>\n",
       "      <td>gemini-1.5-flash-api-0514</td>\n",
       "      <td>model_b</td>\n",
       "      <td>预测如果中国银行被踢出swift系统，中国会发生什么，对中国影响有哪些？\\n</td>\n",
       "      <td>A very interesting and hypothetical question! ...</td>\n",
       "      <td>预测中国银行被踢出SWIFT系统对中国的影响非常复杂，需要考虑多种因素。以下是一些可能的场景...</td>\n",
       "      <td>Chinese</td>\n",
       "      <td>{'context_a_tokens': 35, 'context_b_tokens': 3...</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>{'criteria_v0.1': {'complexity': False, 'creat...</td>\n",
       "      <td>0.811355</td>\n",
       "      <td>607</td>\n",
       "      <td>860</td>\n",
       "      <td>0.705814</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10215</th>\n",
       "      <td>8ed0c49f1edf431b939ddc43499634bd</td>\n",
       "      <td>command-r-plus</td>\n",
       "      <td>gpt-4-0314</td>\n",
       "      <td>model_a</td>\n",
       "      <td>bana thymeleafi öğretir misin</td>\n",
       "      <td>Elbette! Thymeleaf, Java web uygulamaları için...</td>\n",
       "      <td>Tabii ki size Thymeleaf hakkında bilgi verebil...</td>\n",
       "      <td>Khasi</td>\n",
       "      <td>{'context_a_tokens': 12, 'context_b_tokens': 1...</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>{'criteria_v0.1': {'complexity': False, 'creat...</td>\n",
       "      <td>0.887187</td>\n",
       "      <td>1111</td>\n",
       "      <td>831</td>\n",
       "      <td>0.747975</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10216 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            question_id                   model_a  \\\n",
       "0      f4c6889138ae4867a686df2f35080a3a        mistral-large-2402   \n",
       "1      3c547616a9a0433481a2f32b06311a36  phi-3-medium-4k-instruct   \n",
       "2      efca77bd70ce41c8acb6e806f2509705        gpt-4-1106-preview   \n",
       "3      ae8804ad877941a29412972c07b006ab       llama-3-8b-instruct   \n",
       "4      6cb357ed83d343c59820654649b534c7        gpt-4-1106-preview   \n",
       "...                                 ...                       ...   \n",
       "10211  77d972487d8649ec8f92f3cc87c97df1            command-r-plus   \n",
       "10212  5c67012d83cd4b97b3be292baa75bab6   gemini-1.5-pro-api-0514   \n",
       "10213  42ef375bbb8a4f59be13d623e42bc42d           yi-1.5-34b-chat   \n",
       "10214  87f2164cc0f0464e948c78d154074457       llama-3-8b-instruct   \n",
       "10215  8ed0c49f1edf431b939ddc43499634bd            command-r-plus   \n",
       "\n",
       "                         model_b   winner  \\\n",
       "0              gpt-4o-2024-05-13  model_a   \n",
       "1           llama-3-70b-instruct  model_b   \n",
       "2              gpt-4o-2024-05-13  model_a   \n",
       "3                      command-r  model_a   \n",
       "4            llama-3-8b-instruct  model_b   \n",
       "...                          ...      ...   \n",
       "10211         mistral-large-2402  model_a   \n",
       "10212       llama-3-70b-instruct  model_a   \n",
       "10213         mistral-large-2402  model_b   \n",
       "10214  gemini-1.5-flash-api-0514  model_b   \n",
       "10215                 gpt-4-0314  model_a   \n",
       "\n",
       "                                                  prompt  \\\n",
       "0      I'm a developer building a chrome extension. I...   \n",
       "1      Repeat the question and then answer it: Matteo...   \n",
       "2      Your response must begin with 'なんJ 栗の花大学 生唾芸術C...   \n",
       "3      in angular app, i want to create a dropdown, b...   \n",
       "4      Я хочу чтобы ты разработал стратегию анализа к...   \n",
       "...                                                  ...   \n",
       "10211  create an Epic Rap Battles of History rap batt...   \n",
       "10212  how to create a weaviate schema of vectors rep...   \n",
       "10213  if a had some popping candy and i put it in so...   \n",
       "10214             预测如果中国银行被踢出swift系统，中国会发生什么，对中国影响有哪些？\\n   \n",
       "10215                      bana thymeleafi öğretir misin   \n",
       "\n",
       "                                              response_1  \\\n",
       "0      From your description, it seems like you are t...   \n",
       "1      Question: Matteo has 20 apples and buys 20 ora...   \n",
       "2      なんJ 栗の花大学 生唾芸術CreamPan部\\nたっぷりUnchartedな世界で満足して...   \n",
       "3      ```\\n<div>\\n  <select>\\n    <ng-container *ngF...   \n",
       "4      Анализ криптовалютных проектов, которые не пок...   \n",
       "...                                                  ...   \n",
       "10211  (Intro)\\n\\n[A courtroom setting with Judge Bob...   \n",
       "10212  ```python\\nimport weaviate\\n\\nclient = weaviat...   \n",
       "10213  Putting popping candy into someone else's mout...   \n",
       "10214  A very interesting and hypothetical question! ...   \n",
       "10215  Elbette! Thymeleaf, Java web uygulamaları için...   \n",
       "\n",
       "                                              response_2 language  \\\n",
       "0      The issue seems to be related to how the `chro...  English   \n",
       "1      Here is the question:\\n\\nMatteo has 20 apples,...  English   \n",
       "2      なんJ 栗の花大学 生唾芸術CreamPan部\\nたっぷりUnchartedな世界で満足して...  English   \n",
       "3      Here's an example of how you could create a dr...  English   \n",
       "4      I'd be happy to help you with that!\\n\\nTo deve...  Russian   \n",
       "...                                                  ...      ...   \n",
       "10211  Verse 1 - Dr. House:\\nYo, it's Dr. House, the ...  English   \n",
       "10212  In Weaviate, you can create a schema for vecto...  English   \n",
       "10213  If you put popping candy in someone else's mou...  English   \n",
       "10214  预测中国银行被踢出SWIFT系统对中国的影响非常复杂，需要考虑多种因素。以下是一些可能的场景...  Chinese   \n",
       "10215  Tabii ki size Thymeleaf hakkında bilgi verebil...    Khasi   \n",
       "\n",
       "                                           conv_metadata  is_code  ...  \\\n",
       "0      {'context_a_tokens': 191, 'context_b_tokens': ...     True  ...   \n",
       "1      {'context_a_tokens': 51, 'context_b_tokens': 5...    False  ...   \n",
       "2      {'context_a_tokens': 665, 'context_b_tokens': ...    False  ...   \n",
       "3      {'context_a_tokens': 48, 'context_b_tokens': 4...     True  ...   \n",
       "4      {'context_a_tokens': 104, 'context_b_tokens': ...    False  ...   \n",
       "...                                                  ...      ...  ...   \n",
       "10211  {'context_a_tokens': 26, 'context_b_tokens': 2...    False  ...   \n",
       "10212  {'context_a_tokens': 17, 'context_b_tokens': 1...     True  ...   \n",
       "10213  {'context_a_tokens': 24, 'context_b_tokens': 2...    False  ...   \n",
       "10214  {'context_a_tokens': 35, 'context_b_tokens': 3...    False  ...   \n",
       "10215  {'context_a_tokens': 12, 'context_b_tokens': 1...     True  ...   \n",
       "\n",
       "                                            category_tag       sim  length_a  \\\n",
       "0      {'criteria_v0.1': {'complexity': True, 'creati...  0.827569       546   \n",
       "1      {'criteria_v0.1': {'complexity': True, 'creati...  0.958751       211   \n",
       "2      {'criteria_v0.1': {'complexity': False, 'creat...  0.841135      1077   \n",
       "3      {'criteria_v0.1': {'complexity': True, 'creati...  0.723722        94   \n",
       "4      {'criteria_v0.1': {'complexity': True, 'creati...  0.693042      1045   \n",
       "...                                                  ...       ...       ...   \n",
       "10211  {'criteria_v0.1': {'complexity': False, 'creat...  0.874570       924   \n",
       "10212  {'criteria_v0.1': {'complexity': True, 'creati...  0.797442       644   \n",
       "10213  {'criteria_v0.1': {'complexity': False, 'creat...  0.742115       453   \n",
       "10214  {'criteria_v0.1': {'complexity': False, 'creat...  0.811355       607   \n",
       "10215  {'criteria_v0.1': {'complexity': False, 'creat...  0.887187      1111   \n",
       "\n",
       "       length_b  token_diff  longer_won hard_prompt  easy_prompt  if_prompt  \\\n",
       "0           380    0.695971        True        True        False      False   \n",
       "1           246    0.857724        True        True        False       True   \n",
       "2           964    0.895079        True        True        False       True   \n",
       "3            66    0.702128        True        True        False       True   \n",
       "4          1175    0.889362        True        True        False       True   \n",
       "...         ...         ...         ...         ...          ...        ...   \n",
       "10211       411    0.444805        True       False        False      False   \n",
       "10212       417    0.647516        True       False        False      False   \n",
       "10213        89    0.196468       False       False        False      False   \n",
       "10214       860    0.705814        True       False        False      False   \n",
       "10215       831    0.747975        True       False        False      False   \n",
       "\n",
       "       math_prompt  \n",
       "0            False  \n",
       "1             True  \n",
       "2            False  \n",
       "3            False  \n",
       "4            False  \n",
       "...            ...  \n",
       "10211        False  \n",
       "10212        False  \n",
       "10213        False  \n",
       "10214        False  \n",
       "10215        False  \n",
       "\n",
       "[10216 rows x 21 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_json(\"../dataset/PPE_HF.json\", lines=True)\n",
    "# unclear of classification"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f7551c2",
   "metadata": {},
   "source": [
    "# reward-bench 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6b04039",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating test split: 100%|██████████| 1865/1865 [00:00<00:00, 7437.09 examples/s]\n",
      "Filter: 100%|██████████| 1865/1865 [00:00<00:00, 11469.00 examples/s]\n",
      "Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 10.80ba/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "14267358"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = load_dataset('allenai/reward-bench-2', split='test')\n",
    "ds = ds.filter(lambda ex: ex[\"subset\"]!='Ties')\n",
    "ds.to_json(\"../dataset/reward_bench_v2.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "743b8057",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Factuality', 'Focus', 'Math', 'Precise IF', 'Safety'}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import Dataset\n",
    "ds = Dataset.from_json(\"../dataset/reward_bench_v2.json\")\n",
    "set(ds['subset'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71abee5d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 18.62ba/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "5211844"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "ds = load_dataset('allenai/reward-bench', split='filtered')\n",
    "\n",
    "subset_to_category = {\n",
    "    # ===== Math =====\n",
    "    \"math-prm\": \"math\",\n",
    "\n",
    "    # ===== Coding =====\n",
    "    \"hep-cpp\": \"coding\",\n",
    "    \"hep-go\": \"coding\",\n",
    "    \"hep-java\": \"coding\",\n",
    "    \"hep-js\": \"coding\",\n",
    "    \"hep-python\": \"coding\",\n",
    "    \"hep-rust\": \"coding\",\n",
    "\n",
    "    # ===== Chat =====\n",
    "    \"alpacaeval-easy\": \"chat\",\n",
    "    \"alpacaeval-length\": \"chat\",\n",
    "    \"alpacaeval-hard\": \"chat\",\n",
    "    \"mt-bench-easy\": \"chat\",\n",
    "    \"mt-bench-medium\": \"chat\",\n",
    "    \"mt-bench-hard\": \"chat\",\n",
    "    \"llmbar-natural\": \"chat\",\n",
    "    \"llmbar-adver-neighbor\": \"chat\",\n",
    "    \"llmbar-adver-GPTInst\": \"chat\",\n",
    "    \"llmbar-adver-GPTOut\": \"chat\",\n",
    "    \"llmbar-adver-manual\": \"chat\",\n",
    "\n",
    "    # ===== Safety =====\n",
    "    \"refusals-dangerous\": \"safety\",\n",
    "    \"refusals-offensive\": \"safety\",\n",
    "    \"xstest-should-refuse\": \"safety\",\n",
    "    \"xstest-should-respond\": \"safety\",\n",
    "    \"do-not-answer\": \"safety\",\n",
    "}\n",
    "\n",
    "ds_with_category = ds.map(\n",
    "    lambda x: {\"category\": subset_to_category.get(x[\"subset\"], \"unknown\")}\n",
    ")\n",
    "\n",
    "ds_with_category\n",
    "ds_with_category.to_json(\"../dataset/reward_bench.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "614ffd55",
   "metadata": {},
   "source": [
    "# judgebench "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d139697f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating claude split: 100%|██████████| 270/270 [00:00<00:00, 11719.81 examples/s]\n",
      "Generating gpt split: 100%|██████████| 350/350 [00:00<00:00, 8612.18 examples/s]\n",
      "Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 17.95ba/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'livebench-math',\n",
       " 'livebench-reasoning',\n",
       " 'livecodebench',\n",
       " 'mmlu-pro-biology',\n",
       " 'mmlu-pro-business',\n",
       " 'mmlu-pro-chemistry',\n",
       " 'mmlu-pro-computer science',\n",
       " 'mmlu-pro-economics',\n",
       " 'mmlu-pro-engineering',\n",
       " 'mmlu-pro-health',\n",
       " 'mmlu-pro-history',\n",
       " 'mmlu-pro-law',\n",
       " 'mmlu-pro-math',\n",
       " 'mmlu-pro-other',\n",
       " 'mmlu-pro-philosophy',\n",
       " 'mmlu-pro-physics',\n",
       " 'mmlu-pro-psychology'}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import concatenate_datasets, load_dataset\n",
    "gpt_data = load_dataset(\"ScalerLab/JudgeBench\", split=\"gpt\")\n",
    "claude_data = load_dataset(\"ScalerLab/JudgeBench\", split=\"claude\")\n",
    "combined_dataset = concatenate_datasets([gpt_data, claude_data])    \n",
    "combined_dataset.to_json(\"../dataset/judgebench.json\")\n",
    "\n",
    "set(combined_dataset['source'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30fcaa5c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'livebench-math',\n",
       " 'livebench-reasoning',\n",
       " 'livecodebench',\n",
       " 'mmlu-pro-biology',\n",
       " 'mmlu-pro-business',\n",
       " 'mmlu-pro-chemistry',\n",
       " 'mmlu-pro-computer science',\n",
       " 'mmlu-pro-economics',\n",
       " 'mmlu-pro-engineering',\n",
       " 'mmlu-pro-health',\n",
       " 'mmlu-pro-history',\n",
       " 'mmlu-pro-law',\n",
       " 'mmlu-pro-math',\n",
       " 'mmlu-pro-other',\n",
       " 'mmlu-pro-philosophy',\n",
       " 'mmlu-pro-physics',\n",
       " 'mmlu-pro-psychology'}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = Dataset.from_json(\"../dataset/judgebench.json\")\n",
    "set(ds['source'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5cdb8323",
   "metadata": {},
   "source": [
    "# helpsteer3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea6e94f3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating train split: 100%|██████████| 38459/38459 [00:04<00:00, 7843.04 examples/s] \n",
      "Generating validation split: 100%|██████████| 2017/2017 [00:00<00:00, 7724.30 examples/s]\n",
      "Filter: 100%|██████████| 38459/38459 [00:03<00:00, 10326.28 examples/s]\n",
      "Filter: 100%|██████████| 2017/2017 [00:00<00:00, 10423.89 examples/s]\n",
      "Filter: 100%|██████████| 18568/18568 [00:01<00:00, 17587.09 examples/s]\n",
      "Filter: 100%|██████████| 960/960 [00:00<00:00, 15018.54 examples/s]\n",
      "Flattening the indices: 100%|██████████| 17419/17419 [00:04<00:00, 3549.52 examples/s]\n",
      "Flattening the indices: 100%|██████████| 903/903 [00:00<00:00, 7432.72 examples/s]\n",
      "Creating json from Arrow format: 100%|██████████| 19/19 [00:05<00:00,  3.46ba/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "240586771"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data = load_dataset(\"nvidia/HelpSteer3\", name=\"preference\", split=\"train\")\n",
    "test_data  = load_dataset(\"nvidia/HelpSteer3\", name=\"preference\", split=\"validation\")\n",
    "\n",
    "train_data = train_data.filter(lambda x: len(x[\"context\"])>1)\n",
    "test_data = test_data.filter(lambda x: len(x[\"context\"])>1)\n",
    "\n",
    "train_data = train_data.filter(lambda ex: ex[\"overall_preference\"]!=0)\n",
    "test_data = test_data.filter(lambda ex: ex[\"overall_preference\"]!=0)\n",
    "\n",
    "train_data = train_data.add_column(\"split\", [\"train\"] * len(train_data))\n",
    "test_data  = test_data.add_column(\"split\", [\"test\"] * len(test_data))\n",
    "\n",
    "ds = concatenated_data = concatenate_datasets([train_data, test_data])\n",
    "ds.to_json(\"../dataset/helpsteer3.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89b81c18",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'code', 'general', 'multilingual', 'stem'}"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = Dataset.from_json(\"../dataset/helpsteer3.json\")\n",
    "set(ds['domain'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "85fe0f9d",
   "metadata": {},
   "source": [
    "# construct skyword preference "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3248f092",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating train split: 100%|██████████| 77016/77016 [00:06<00:00, 11935.97 examples/s]\n",
      "Filter: 100%|██████████| 77016/77016 [00:03<00:00, 19789.54 examples/s]\n",
      "Creating json from Arrow format: 100%|██████████| 30/30 [00:01<00:00, 15.42ba/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "207392350"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = load_dataset(\"Skywork/Skywork-Reward-Preference-80K-v0.2\", split=\"train\")\n",
    "ds = ds.filter(lambda x: x['source'] == 'magpie_pro_llama3.1')\n",
    "ds.to_json(\"../dataset/magpie_pro.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "defeb022",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating train split: 29682 examples [00:01, 14889.44 examples/s] \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['chosen', 'rejected', 'source'],\n",
       "    num_rows: 29682\n",
       "})"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = Dataset.from_json(\"../dataset/magpie_pro.json\")\n",
    "ds\n",
    "# no subset classified"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10fa149b",
   "metadata": {},
   "source": [
    "# construct mixture "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1e2a87c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating train split: 100%|██████████| 10795/10795 [00:00<00:00, 28472.97 examples/s]\n",
      "Generating train split: 100%|██████████| 54024/54024 [00:03<00:00, 13717.66 examples/s]\n",
      "Creating json from Arrow format: 100%|██████████| 21/21 [00:03<00:00,  5.99ba/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "55886596"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_math = load_dataset(\"xinlai/Math-Step-DPO-10K\", split=\"train\")\n",
    "ds_math = ds_math.select_columns([\"prompt\", \"full_chosen\",\"full_rejected\"])\n",
    "ds_math = ds_math.rename_columns({\n",
    "    \"full_chosen\": \"chosen\",\n",
    "    \"full_rejected\": \"rejected\"\n",
    "})\n",
    "ds_code = load_dataset(\"Vezora/Code-Preference-Pairs\", split=\"train\").shuffle(seed=42).select(range(10000))\n",
    "ds_code = ds_code.select_columns([\"input\", \"accepted\",\"rejected\"])\n",
    "ds_code = ds_code.rename_columns({\"input\": \"prompt\", \"accepted\": \"chosen\",\"rejected\": \"rejected\"})\n",
    "ds = concatenate_datasets([ds_math, ds_code])\n",
    "# ds = ds.filter(lambda x: x['source'] == 'magpie_pro_llama3.1')\n",
    "ds.to_json(\"../dataset/mixture.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbcd6f85",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating train split: 20795 examples [00:00, 37353.59 examples/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['prompt', 'chosen', 'rejected'],\n",
       "    num_rows: 20795\n",
       "})"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = Dataset.from_json(\"../dataset/mixture.json\")\n",
    "ds\n",
    "# no subset classified"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c591daa",
   "metadata": {},
   "source": [
    "# rm bench"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8ac7387",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "ds = load_dataset('THU-KEG/RM-Bench', split='train', token=\"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\")\n",
    "ds.to_json(\"../dataset/rm_bench.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0413e918",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'chat', 'code', 'math', 'safety-refuse', 'safety-response'}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(ds['domain'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72522735",
   "metadata": {},
   "source": [
    "## Skywork-Reward-Preference-80K-v0.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4673a585",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "ds = load_dataset('Skywork/Skywork-Reward-Preference-80K-v0.2', split='train', token=\"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\")\n",
    "ds.to_json(\"../dataset/skywork.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bc0c4736",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['chosen', 'rejected', 'source'],\n",
       "    num_rows: 77016\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rrm-vllm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
