{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading data: 100%|██████████| 4.62k/4.62k [00:01<00:00, 4.14kB/s]\n",
      "Generating test split: 100%|██████████| 8/8 [00:00<00:00, 933.60 examples/s]\n"
     ]
    }
   ],
   "source": [
    "# Download the LiveBench leaderboard table (config \"2024-07\") from the Hugging Face Hub.\n",
    "import datasets\n",
    "\n",
    "data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-07\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the \"test\" split to a pandas DataFrame so it can be exported and edited.\n",
    "df = data[\"test\"].to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the table to CSV (no index column) for inspection/manual editing outside the notebook.\n",
    "df.to_csv(\"2024-07.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Reload the (possibly hand-edited) CSV back into a DataFrame before re-uploading.\n",
    "df = pd.read_csv(\"2024-07.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Model Name</th>\n",
       "      <th>Total</th>\n",
       "      <th>Concrete Recognition</th>\n",
       "      <th>Contextual Analysis</th>\n",
       "      <th>Deeper Implications</th>\n",
       "      <th>Broader Implications</th>\n",
       "      <th>Further Insights</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>gpt-4o-mini</td>\n",
       "      <td>86.240000</td>\n",
       "      <td>89.0</td>\n",
       "      <td>81.0</td>\n",
       "      <td>89.6</td>\n",
       "      <td>87.600000</td>\n",
       "      <td>84.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>gemini-1.5-flash</td>\n",
       "      <td>80.760000</td>\n",
       "      <td>84.4</td>\n",
       "      <td>69.2</td>\n",
       "      <td>81.7</td>\n",
       "      <td>90.300000</td>\n",
       "      <td>78.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>gpt-4o</td>\n",
       "      <td>91.300000</td>\n",
       "      <td>92.2</td>\n",
       "      <td>83.4</td>\n",
       "      <td>94.1</td>\n",
       "      <td>94.400000</td>\n",
       "      <td>92.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>gemini-1.5-pro</td>\n",
       "      <td>86.280000</td>\n",
       "      <td>90.6</td>\n",
       "      <td>76.6</td>\n",
       "      <td>85.6</td>\n",
       "      <td>91.600000</td>\n",
       "      <td>87.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>llama3-llava-next-8b</td>\n",
       "      <td>62.650602</td>\n",
       "      <td>60.1</td>\n",
       "      <td>61.4</td>\n",
       "      <td>74.8</td>\n",
       "      <td>63.673469</td>\n",
       "      <td>53.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>llava-1.5-7b</td>\n",
       "      <td>41.940000</td>\n",
       "      <td>38.6</td>\n",
       "      <td>34.5</td>\n",
       "      <td>58.8</td>\n",
       "      <td>42.800000</td>\n",
       "      <td>35.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Idefics2-8B</td>\n",
       "      <td>25.860000</td>\n",
       "      <td>18.0</td>\n",
       "      <td>16.3</td>\n",
       "      <td>43.8</td>\n",
       "      <td>27.000000</td>\n",
       "      <td>24.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>InternVL2-2B</td>\n",
       "      <td>56.840000</td>\n",
       "      <td>65.8</td>\n",
       "      <td>49.9</td>\n",
       "      <td>64.2</td>\n",
       "      <td>55.800000</td>\n",
       "      <td>48.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Model Name      Total  Concrete Recognition  Contextual Analysis  \\\n",
       "0           gpt-4o-mini  86.240000                 89.0                 81.0   \n",
       "1      gemini-1.5-flash  80.760000                 84.4                 69.2   \n",
       "2                gpt-4o  91.300000                 92.2                 83.4   \n",
       "3        gemini-1.5-pro  86.280000                 90.6                 76.6   \n",
       "4  llama3-llava-next-8b  62.650602                 60.1                 61.4   \n",
       "5          llava-1.5-7b  41.940000                 38.6                 34.5   \n",
       "6           Idefics2-8B  25.860000                 18.0                 16.3   \n",
       "7          InternVL2-2B  56.840000                 65.8                 49.9   \n",
       "\n",
       "   Deeper Implications  Broader Implications  Further Insights  \n",
       "0                 89.6             87.600000              84.0  \n",
       "1                 81.7             90.300000              78.2  \n",
       "2                 94.1             94.400000              92.4  \n",
       "3                 85.6             91.600000              87.0  \n",
       "4                 74.8             63.673469              53.3  \n",
       "5                 58.8             42.800000              35.0  \n",
       "6                 43.8             27.000000              24.2  \n",
       "7                 64.2             55.800000              48.5  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the edited DataFrame back into a Hugging Face Dataset for upload.\n",
    "# NOTE(review): `data` is reused here — it was a DatasetDict above, now a single Dataset.\n",
    "data = datasets.Dataset.from_pandas(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 473.45ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.17s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchResults/commit/a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', commit_message='Upload dataset', commit_description='', oid='a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Push the updated table back to the Hub as the \"test\" split of config \"2024-07\".\n",
    "data.push_to_hub(\"lmms-lab/LiveBenchResults\", \"2024-07\", split=\"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the per-model detailed results; each model is stored as its own split.\n",
    "data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the \"idefics2\" split under the corrected name \"Idefics2_8B\".\n",
    "# This duplicates rather than renames: both keys exist until \"idefics2\" is dropped below.\n",
    "data[\"Idefics2_8B\"] = data[\"idefics2\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    gpt_4o_mini: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    gemini_1.5_flash: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    gpt_4o: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    gemini_1.5_pro: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    llama3_llava_next_8b: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    llava_1.5_7b: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    idefics2: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    InternVL2_2B: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    Idefics2_8B: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the now-duplicated \"idefics2\" split, keeping every other split unchanged.\n",
    "data = datasets.DatasetDict(\n",
    "    {name: split for name, split in data.items() if name != \"idefics2\"}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    gpt_4o_mini: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    gemini_1.5_flash: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    gpt_4o: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    gemini_1.5_pro: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    llama3_llava_next_8b: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    llava_1.5_7b: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    InternVL2_2B: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "    Idefics2_8B: Dataset({\n",
       "        features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n",
       "        num_rows: 250\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 250/250 [00:00<00:00, 347.35 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.58ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]\n",
      "Map: 100%|██████████| 250/250 [00:00<00:00, 363.40 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.70ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.59s/it]\n",
      "Map: 100%|██████████| 250/250 [00:00<00:00, 472.60 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.62ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.43s/it]\n",
      "Map: 100%|██████████| 250/250 [00:00<00:00, 352.11 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.55ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]\n",
      "Map: 100%|██████████| 250/250 [00:00<00:00, 475.90 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.38ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.46s/it]\n",
      "Map: 100%|██████████| 250/250 [00:00<00:00, 364.89 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 10.94ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.59s/it]\n",
      "Map: 100%|██████████| 250/250 [00:00<00:00, 529.96 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 13.51ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]\n",
      "Map: 100%|██████████| 250/250 [00:00<00:00, 349.67 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.74ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchDetailedResults/commit/047d6dc66759e0a8b57b4e6015db6208da1cd4da', commit_message='Upload dataset', commit_description='', oid='047d6dc66759e0a8b57b4e6015db6208da1cd4da', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Upload all splits (one per model, including the renamed Idefics2_8B) back to the Hub.\n",
    "data.push_to_hub(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "live_bench",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
