{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = load_dataset(\"lmms-lab/LiveBench\", \"2024-06\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = data[\"test\"].to_pandas()\n",
    "df = df[df[\"checker\"].notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.iloc[620][\"checker\"] is None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "subtask\n",
       "Contextual Analysis     477\n",
       "Deeper Implications     132\n",
       "Basic Understanding     118\n",
       "Further Insights         63\n",
       "Broader Implications     52\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"subtask\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.sample(frac=1).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_303711/5529174.py:1: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "  top_50_per_subtask = df.groupby('subtask').apply(lambda x: x.nlargest(50, 'score'))\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "250"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top_50_per_subtask = df.groupby(\"subtask\").apply(lambda x: x.nlargest(50, \"score\"))\n",
    "top_50_per_subtask.reset_index(drop=True, inplace=True)\n",
    "len(top_50_per_subtask)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "subtask\n",
       "Basic Understanding     50\n",
       "Broader Implications    50\n",
       "Contextual Analysis     50\n",
       "Deeper Implications     50\n",
       "Further Insights        50\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top_50_per_subtask[\"subtask\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9.276"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "np.mean(top_50_per_subtask[\"score\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset, Features\n",
    "import datasets\n",
    "\n",
    "\n",
    "def gen():\n",
    "    for d in top_50_per_subtask:\n",
    "        yield d\n",
    "\n",
    "\n",
    "data = Dataset.from_pandas(\n",
    "    top_50_per_subtask,\n",
    "    features=Features(\n",
    "        {\n",
    "            \"id\": datasets.Value(\"int32\"),\n",
    "            \"images\": datasets.Sequence(datasets.Image()),\n",
    "            \"website\": datasets.Value(\"string\"),\n",
    "            \"question\": datasets.Value(\"string\"),\n",
    "            \"answer\": datasets.Value(\"string\"),\n",
    "            \"criteria\": datasets.Value(\"string\"),\n",
    "            \"subtask\": datasets.Value(\"string\"),\n",
    "            \"data_generator\": datasets.Value(\"string\"),\n",
    "            \"checker\": datasets.Value(\"string\"),\n",
    "            \"date_time\": datasets.Value(\"string\"),\n",
    "            \"screen_shoter\": datasets.Value(\"string\"),\n",
    "            \"screen_size\": datasets.Value(\"string\"),\n",
    "            \"score\": datasets.Value(\"int32\"),\n",
    "            \"reason\": datasets.Value(\"string\"),\n",
    "            \"scorer_name\": datasets.Value(\"string\"),\n",
    "        }\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'images', 'website', 'question', 'answer', 'criteria', 'subtask', 'data_generator', 'checker', 'date_time', 'screen_shoter', 'screen_size', 'score', 'reason', 'scorer_name'],\n",
       "    num_rows: 250\n",
       "})"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': Value(dtype='int32', id=None),\n",
       " 'images': Sequence(feature=Image(mode=None, decode=True, id=None), length=-1, id=None),\n",
       " 'website': Value(dtype='string', id=None),\n",
       " 'question': Value(dtype='string', id=None),\n",
       " 'answer': Value(dtype='string', id=None),\n",
       " 'criteria': Value(dtype='string', id=None),\n",
       " 'subtask': Value(dtype='string', id=None),\n",
       " 'data_generator': Value(dtype='string', id=None),\n",
       " 'checker': Value(dtype='string', id=None),\n",
       " 'date_time': Value(dtype='string', id=None),\n",
       " 'screen_shoter': Value(dtype='string', id=None),\n",
       " 'screen_size': Value(dtype='string', id=None),\n",
       " 'score': Value(dtype='int32', id=None),\n",
       " 'reason': Value(dtype='string', id=None),\n",
       " 'scorer_name': Value(dtype='string', id=None)}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 250/250 [00:00<00:00, 273.33 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00,  4.98ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:18<00:00, 18.34s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBench/commit/89ce4e07cf0988b2305b7227ac2677d481e6112a', commit_message='Upload dataset', commit_description='', oid='89ce4e07cf0988b2305b7227ac2677d481e6112a', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-06\", split=\"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "lmms-eval",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
