{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import os\n",
    "import re\n",
    "from datetime import datetime\n",
    "import glob\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### download hf leaderboard history"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/homebrew/Caskroom/miniforge/base/envs/elo/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Fetching 227 files: 100%|██████████| 227/227 [00:07<00:00, 28.93it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'/Users/shivalika/workspace/chatbot_arena/chatbot-arena-study/lm_elo/analysis/leaderboard_results'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from huggingface_hub import snapshot_download\n",
    "\n",
    "snapshot_download(repo_id=\"lmarena-ai/chatbot-arena-leaderboard\",  repo_type=\"space\", local_dir=\"./leaderboard_results\") #allow_patterns=\"*.csv\","
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### read in data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing file: leaderboard_results/elo_results_20240706.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250105.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241222.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241023.pkl\n",
      "Processing file: leaderboard_results/elo_results_20230802.pkl\n",
      "Processing file: leaderboard_results/elo_results_20230619.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250311.pkl\n",
      "Processing file: leaderboard_results/elo_results_20231116.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240927.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240516.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250315.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241218.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241230.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240305.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240501.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240515.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240313.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240307.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250128.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250317.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250303.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250115.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240716.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240215.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240413.pkl\n",
      "Processing file: leaderboard_results/elo_results_20231215.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240611.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250416.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240822.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240823.pkl\n",
      "Processing file: leaderboard_results/elo_results_20231002.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241122.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250205.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250211.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241120.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240202.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240410.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240606.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240411.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241121.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250206.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240617.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250405.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240602.pkl\n",
      "Processing file: leaderboard_results/elo_results_20231206.pkl\n",
      "Processing file: leaderboard_results/elo_results_20230905.pkl\n",
      "Processing file: leaderboard_results/elo_results_20230508.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250217.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250203.pkl\n",
      "Processing file: leaderboard_results/elo_results_20230522.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240827.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240629.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240403.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250214.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240426.pkl\n",
      "Processing file: leaderboard_results/elo_results_20231220.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250423.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240419.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250408.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240828.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240801.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240626.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250409.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240418.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250227.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241104.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240805.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240623.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240409.pkl\n",
      "Processing file: leaderboard_results/elo_results_20230717.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241113.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240621.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240806.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240813.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240422.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241112.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250209.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250221.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240519.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241015.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250124.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240109.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250326.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241028.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250119.pkl\n",
      "Processing file: leaderboard_results/elo_results_20231108.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240915.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240730.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240917.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250325.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250330.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241215.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241201.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240527.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240725.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240731.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250122.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241007.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250320.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241205.pkl\n",
      "Processing file: leaderboard_results/elo_results_20241210.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240708.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240722.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240520.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240508.pkl\n",
      "Processing file: leaderboard_results/elo_results_20250121.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240326.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240118.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240125.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240327.pkl\n",
      "Processing file: leaderboard_results/elo_results_20240904.pkl\n",
      "Found 111 dated ELO result files:\n",
      "                                        file_path formatted_date\n",
      "0    leaderboard_results/elo_results_20240706.pkl     2024-07-06\n",
      "1    leaderboard_results/elo_results_20250105.pkl     2025-01-05\n",
      "2    leaderboard_results/elo_results_20241222.pkl     2024-12-22\n",
      "3    leaderboard_results/elo_results_20241023.pkl     2024-10-23\n",
      "4    leaderboard_results/elo_results_20230802.pkl     2023-08-02\n",
      "..                                            ...            ...\n",
      "106  leaderboard_results/elo_results_20240326.pkl     2024-03-26\n",
      "107  leaderboard_results/elo_results_20240118.pkl     2024-01-18\n",
      "108  leaderboard_results/elo_results_20240125.pkl     2024-01-25\n",
      "109  leaderboard_results/elo_results_20240327.pkl     2024-03-27\n",
      "110  leaderboard_results/elo_results_20240904.pkl     2024-09-04\n",
      "\n",
      "[111 rows x 2 columns]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>file_path</th>\n",
       "      <th>formatted_date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>leaderboard_results/elo_results_20240706.pkl</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>leaderboard_results/elo_results_20250105.pkl</td>\n",
       "      <td>2025-01-05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>leaderboard_results/elo_results_20241222.pkl</td>\n",
       "      <td>2024-12-22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>leaderboard_results/elo_results_20241023.pkl</td>\n",
       "      <td>2024-10-23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>leaderboard_results/elo_results_20230802.pkl</td>\n",
       "      <td>2023-08-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>leaderboard_results/elo_results_20240326.pkl</td>\n",
       "      <td>2024-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>leaderboard_results/elo_results_20240118.pkl</td>\n",
       "      <td>2024-01-18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>leaderboard_results/elo_results_20240125.pkl</td>\n",
       "      <td>2024-01-25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>leaderboard_results/elo_results_20240327.pkl</td>\n",
       "      <td>2024-03-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>leaderboard_results/elo_results_20240904.pkl</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>111 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        file_path formatted_date\n",
       "0    leaderboard_results/elo_results_20240706.pkl     2024-07-06\n",
       "1    leaderboard_results/elo_results_20250105.pkl     2025-01-05\n",
       "2    leaderboard_results/elo_results_20241222.pkl     2024-12-22\n",
       "3    leaderboard_results/elo_results_20241023.pkl     2024-10-23\n",
       "4    leaderboard_results/elo_results_20230802.pkl     2023-08-02\n",
       "..                                            ...            ...\n",
       "106  leaderboard_results/elo_results_20240326.pkl     2024-03-26\n",
       "107  leaderboard_results/elo_results_20240118.pkl     2024-01-18\n",
       "108  leaderboard_results/elo_results_20240125.pkl     2024-01-25\n",
       "109  leaderboard_results/elo_results_20240327.pkl     2024-03-27\n",
       "110  leaderboard_results/elo_results_20240904.pkl     2024-09-04\n",
       "\n",
       "[111 rows x 2 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Get all pickle files in the elo_results directory\n",
    "elo_files = glob.glob(\"leaderboard_results/*.pkl\")\n",
    "\n",
    "# Extract dates from filenames using regex\n",
    "date_pattern = re.compile(r'elo_results_(\\d{8})\\.pkl')\n",
    "dates = []\n",
    "\n",
    "for file_path in elo_files:\n",
    "    print(f\"Processing file: {file_path}\")\n",
    "    filename = os.path.basename(file_path)\n",
    "    match = date_pattern.search(filename)\n",
    "    if match:\n",
    "        date_str = match.group(1)\n",
    "        # Convert YYYYMMDD format to datetime object\n",
    "        date_obj = datetime.strptime(date_str, '%Y%m%d')\n",
    "        dates.append({\n",
    "            'file_path': file_path,\n",
    "            'formatted_date': date_obj.strftime('%Y-%m-%d')\n",
    "        })\n",
    "\n",
    "date_df = pd.DataFrame(dates)\n",
    "\n",
    "# date_df = date_df.sort_values('date')\n",
    "print(f\"Found {len(date_df)} dated ELO result files:\")\n",
    "print(date_df[['file_path', 'formatted_date']])\n",
    "date_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Added results from 2024-07-06\n",
      "Added results from 2025-01-05\n",
      "Added results from 2024-12-22\n",
      "Added results from 2024-10-23\n",
      "Added results from 2025-03-11\n",
      "Added results from 2024-09-27\n",
      "Added results from 2024-05-16\n",
      "Added results from 2025-03-15\n",
      "Added results from 2024-12-18\n",
      "Added results from 2024-12-30\n",
      "Added results from 2024-03-05\n",
      "Added results from 2024-05-01\n",
      "Added results from 2024-05-15\n",
      "Added results from 2024-03-13\n",
      "Added results from 2024-03-07\n",
      "Added results from 2025-01-28\n",
      "Added results from 2025-03-17\n",
      "Added results from 2025-03-03\n",
      "Added results from 2025-01-15\n",
      "Added results from 2024-07-16\n",
      "Added results from 2024-02-15\n",
      "Added results from 2024-04-13\n",
      "Added results from 2024-06-11\n",
      "Added results from 2025-04-16\n",
      "Added results from 2024-08-22\n",
      "Added results from 2024-08-23\n",
      "Added results from 2024-11-22\n",
      "Added results from 2025-02-05\n",
      "Added results from 2025-02-11\n",
      "Added results from 2024-11-20\n",
      "Added results from 2024-02-02\n",
      "Added results from 2024-04-10\n",
      "Added results from 2024-06-06\n",
      "Added results from 2024-04-11\n",
      "Added results from 2024-11-21\n",
      "Added results from 2025-02-06\n",
      "Added results from 2024-06-17\n",
      "Added results from 2025-04-05\n",
      "Added results from 2024-06-02\n",
      "Added results from 2025-02-17\n",
      "Added results from 2025-02-03\n",
      "Added results from 2024-08-27\n",
      "Added results from 2024-06-29\n",
      "Added results from 2024-04-03\n",
      "Added results from 2025-02-14\n",
      "Added results from 2024-04-26\n",
      "Added results from 2025-04-23\n",
      "Added results from 2024-04-19\n",
      "Added results from 2025-04-08\n",
      "Added results from 2024-08-28\n",
      "Added results from 2024-08-01\n",
      "Added results from 2024-06-26\n",
      "Added results from 2025-04-09\n",
      "Added results from 2024-04-18\n",
      "Added results from 2025-02-27\n",
      "Added results from 2024-11-04\n",
      "Added results from 2024-08-05\n",
      "Added results from 2024-06-23\n",
      "Added results from 2024-04-09\n",
      "Added results from 2024-11-13\n",
      "Added results from 2024-06-21\n",
      "Added results from 2024-08-06\n",
      "Added results from 2024-08-13\n",
      "Added results from 2024-04-22\n",
      "Added results from 2024-11-12\n",
      "Added results from 2025-02-09\n",
      "Added results from 2025-02-21\n",
      "Added results from 2024-05-19\n",
      "Added results from 2024-10-15\n",
      "Added results from 2025-01-24\n",
      "Added results from 2024-01-09\n",
      "Added results from 2025-03-26\n",
      "Added results from 2024-10-28\n",
      "Added results from 2025-01-19\n",
      "Added results from 2024-09-15\n",
      "Added results from 2024-07-30\n",
      "Added results from 2024-09-17\n",
      "Added results from 2025-03-25\n",
      "Added results from 2025-03-30\n",
      "Added results from 2024-12-15\n",
      "Added results from 2024-12-01\n",
      "Added results from 2024-05-27\n",
      "Added results from 2024-07-25\n",
      "Added results from 2024-07-31\n",
      "Added results from 2025-01-22\n",
      "Added results from 2024-10-07\n",
      "Added results from 2025-03-20\n",
      "Added results from 2024-12-05\n",
      "Added results from 2024-12-10\n",
      "Added results from 2024-07-08\n",
      "Added results from 2024-07-22\n",
      "Added results from 2024-05-20\n",
      "Added results from 2024-05-08\n",
      "Added results from 2025-01-21\n",
      "Added results from 2024-03-26\n",
      "Added results from 2024-01-18\n",
      "Added results from 2024-01-25\n",
      "Added results from 2024-03-27\n",
      "Added results from 2024-09-04\n",
      "Created merged results dataframe with 14305 rows for dates after 2023-12-20 00:00:00\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>rating</th>\n",
       "      <th>variance</th>\n",
       "      <th>rating_q975</th>\n",
       "      <th>rating_q025</th>\n",
       "      <th>num_battles</th>\n",
       "      <th>final_ranking</th>\n",
       "      <th>date</th>\n",
       "      <th>datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"5\" valign=\"top\">2024-07-06</th>\n",
       "      <th>RWKV-4-Raven-14B</th>\n",
       "      <td>921.404758</td>\n",
       "      <td>19.994783</td>\n",
       "      <td>931.690090</td>\n",
       "      <td>913.823607</td>\n",
       "      <td>4943</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>alpaca-13b</th>\n",
       "      <td>901.016508</td>\n",
       "      <td>21.937902</td>\n",
       "      <td>909.301810</td>\n",
       "      <td>891.855328</td>\n",
       "      <td>5876</td>\n",
       "      <td>109.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bard-jan-24-gemini-pro</th>\n",
       "      <td>1208.468423</td>\n",
       "      <td>9.904404</td>\n",
       "      <td>1213.746117</td>\n",
       "      <td>1201.453472</td>\n",
       "      <td>11830</td>\n",
       "      <td>12.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chatglm-6b</th>\n",
       "      <td>878.863152</td>\n",
       "      <td>25.038000</td>\n",
       "      <td>889.146926</td>\n",
       "      <td>870.238677</td>\n",
       "      <td>4998</td>\n",
       "      <td>110.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>chatglm2-6b</th>\n",
       "      <td>923.934417</td>\n",
       "      <td>27.389251</td>\n",
       "      <td>933.436305</td>\n",
       "      <td>914.736414</td>\n",
       "      <td>2709</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"5\" valign=\"top\">2024-09-04</th>\n",
       "      <th>yi-large</th>\n",
       "      <td>1212.459813</td>\n",
       "      <td>4.688072</td>\n",
       "      <td>1216.324375</td>\n",
       "      <td>1208.165403</td>\n",
       "      <td>16672</td>\n",
       "      <td>27.0</td>\n",
       "      <td>2024-09-04</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>yi-large-preview</th>\n",
       "      <td>1239.529422</td>\n",
       "      <td>2.157219</td>\n",
       "      <td>1242.573535</td>\n",
       "      <td>1236.674081</td>\n",
       "      <td>51715</td>\n",
       "      <td>24.0</td>\n",
       "      <td>2024-09-04</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zephyr-7b-alpha</th>\n",
       "      <td>1041.270032</td>\n",
       "      <td>52.898264</td>\n",
       "      <td>1055.313443</td>\n",
       "      <td>1026.192487</td>\n",
       "      <td>1815</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-09-04</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zephyr-7b-beta</th>\n",
       "      <td>1053.379519</td>\n",
       "      <td>9.253514</td>\n",
       "      <td>1058.349443</td>\n",
       "      <td>1047.663309</td>\n",
       "      <td>11318</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-09-04</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zephyr-orpo-141b-A35b-v0.1</th>\n",
       "      <td>1127.188325</td>\n",
       "      <td>15.889332</td>\n",
       "      <td>1134.560465</td>\n",
       "      <td>1119.035415</td>\n",
       "      <td>4858</td>\n",
       "      <td>66.0</td>\n",
       "      <td>2024-09-04</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14305 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            rating   variance  rating_q975  \\\n",
       "2024-07-06 RWKV-4-Raven-14B             921.404758  19.994783   931.690090   \n",
       "           alpaca-13b                   901.016508  21.937902   909.301810   \n",
       "           bard-jan-24-gemini-pro      1208.468423   9.904404  1213.746117   \n",
       "           chatglm-6b                   878.863152  25.038000   889.146926   \n",
       "           chatglm2-6b                  923.934417  27.389251   933.436305   \n",
       "...                                            ...        ...          ...   \n",
       "2024-09-04 yi-large                    1212.459813   4.688072  1216.324375   \n",
       "           yi-large-preview            1239.529422   2.157219  1242.573535   \n",
       "           zephyr-7b-alpha             1041.270032  52.898264  1055.313443   \n",
       "           zephyr-7b-beta              1053.379519   9.253514  1058.349443   \n",
       "           zephyr-orpo-141b-A35b-v0.1  1127.188325  15.889332  1134.560465   \n",
       "\n",
       "                                       rating_q025  num_battles  \\\n",
       "2024-07-06 RWKV-4-Raven-14B             913.823607         4943   \n",
       "           alpaca-13b                   891.855328         5876   \n",
       "           bard-jan-24-gemini-pro      1201.453472        11830   \n",
       "           chatglm-6b                   870.238677         4998   \n",
       "           chatglm2-6b                  914.736414         2709   \n",
       "...                                            ...          ...   \n",
       "2024-09-04 yi-large                    1208.165403        16672   \n",
       "           yi-large-preview            1236.674081        51715   \n",
       "           zephyr-7b-alpha             1026.192487         1815   \n",
       "           zephyr-7b-beta              1047.663309        11318   \n",
       "           zephyr-orpo-141b-A35b-v0.1  1119.035415         4858   \n",
       "\n",
       "                                       final_ranking        date   datetime  \n",
       "2024-07-06 RWKV-4-Raven-14B                    105.0  2024-07-06 2024-07-06  \n",
       "           alpaca-13b                          109.0  2024-07-06 2024-07-06  \n",
       "           bard-jan-24-gemini-pro               12.0  2024-07-06 2024-07-06  \n",
       "           chatglm-6b                          110.0  2024-07-06 2024-07-06  \n",
       "           chatglm2-6b                         105.0  2024-07-06 2024-07-06  \n",
       "...                                              ...         ...        ...  \n",
       "2024-09-04 yi-large                             27.0  2024-09-04 2024-09-04  \n",
       "           yi-large-preview                     24.0  2024-09-04 2024-09-04  \n",
       "           zephyr-7b-alpha                     105.0  2024-09-04 2024-09-04  \n",
       "           zephyr-7b-beta                      105.0  2024-09-04 2024-09-04  \n",
       "           zephyr-orpo-141b-A35b-v0.1           66.0  2024-09-04 2024-09-04  \n",
       "\n",
       "[14305 rows x 8 columns]"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create a dictionary to store results for each date\n",
    "from datetime import datetime\n",
    "all_results = {}\n",
    "\n",
    "cutoff_date = datetime.strptime('2023-12-20', '%Y-%m-%d')\n",
    "\n",
    "# Go through each file path in date_df\n",
    "for _, row in date_df.iterrows():\n",
    "    file_path = row['file_path']\n",
    "    date = row['formatted_date']\n",
    "    datetime_temp = pd.to_datetime(date)\n",
    "    \n",
    "    # Check if the date is after Dec 7th, 2023\n",
    "    current_date = datetime.strptime(date, '%Y-%m-%d')\n",
    "    if current_date > cutoff_date:\n",
    "        # Load the pickle file\n",
    "        with open(file_path, 'rb') as f:\n",
    "            data = pickle.load(f)\n",
    "        \n",
    "        try:\n",
    "            leaderboard_df = data['text']['full']['leaderboard_table_df']\n",
    "        except:\n",
    "            try:\n",
    "                leaderboard_df = data['full']['leaderboard_table_df']\n",
    "            except:\n",
    "                try:\n",
    "                    leaderboard_df = data['leaderboard_table_df'] \n",
    "                except KeyError as e:\n",
    "                    raise KeyError(f\"Key 'leaderboard_table_df' not found in the data structure: {e}\")\n",
    "\n",
    "        leaderboard_df['date'] = date\n",
    "        leaderboard_df['datetime'] = datetime_temp\n",
    "        \n",
    "        all_results[date] = leaderboard_df\n",
    "        print(f\"Added results from {date}\")\n",
    "\n",
    "\n",
    "merged_results_df = pd.concat(all_results.values(), keys=all_results.keys())\n",
    "print(f\"Created merged results dataframe with {len(merged_results_df)} rows for dates after {cutoff_date}\")\n",
    "\n",
    "merged_results_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model_name</th>\n",
       "      <th>rating</th>\n",
       "      <th>variance</th>\n",
       "      <th>rating_q975</th>\n",
       "      <th>rating_q025</th>\n",
       "      <th>num_battles</th>\n",
       "      <th>final_ranking</th>\n",
       "      <th>date</th>\n",
       "      <th>datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>RWKV-4-Raven-14B</td>\n",
       "      <td>921.404758</td>\n",
       "      <td>19.994783</td>\n",
       "      <td>931.690090</td>\n",
       "      <td>913.823607</td>\n",
       "      <td>4943</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alpaca-13b</td>\n",
       "      <td>901.016508</td>\n",
       "      <td>21.937902</td>\n",
       "      <td>909.301810</td>\n",
       "      <td>891.855328</td>\n",
       "      <td>5876</td>\n",
       "      <td>109.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>bard-jan-24-gemini-pro</td>\n",
       "      <td>1208.468423</td>\n",
       "      <td>9.904404</td>\n",
       "      <td>1213.746117</td>\n",
       "      <td>1201.453472</td>\n",
       "      <td>11830</td>\n",
       "      <td>12.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chatglm-6b</td>\n",
       "      <td>878.863152</td>\n",
       "      <td>25.038000</td>\n",
       "      <td>889.146926</td>\n",
       "      <td>870.238677</td>\n",
       "      <td>4998</td>\n",
       "      <td>110.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>chatglm2-6b</td>\n",
       "      <td>923.934417</td>\n",
       "      <td>27.389251</td>\n",
       "      <td>933.436305</td>\n",
       "      <td>914.736414</td>\n",
       "      <td>2709</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14300</th>\n",
       "      <td>yi-large</td>\n",
       "      <td>1212.459813</td>\n",
       "      <td>4.688072</td>\n",
       "      <td>1216.324375</td>\n",
       "      <td>1208.165403</td>\n",
       "      <td>16672</td>\n",
       "      <td>27.0</td>\n",
       "      <td>2024-09-04</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14301</th>\n",
       "      <td>yi-large-preview</td>\n",
       "      <td>1239.529422</td>\n",
       "      <td>2.157219</td>\n",
       "      <td>1242.573535</td>\n",
       "      <td>1236.674081</td>\n",
       "      <td>51715</td>\n",
       "      <td>24.0</td>\n",
       "      <td>2024-09-04</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14302</th>\n",
       "      <td>zephyr-7b-alpha</td>\n",
       "      <td>1041.270032</td>\n",
       "      <td>52.898264</td>\n",
       "      <td>1055.313443</td>\n",
       "      <td>1026.192487</td>\n",
       "      <td>1815</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-09-04</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14303</th>\n",
       "      <td>zephyr-7b-beta</td>\n",
       "      <td>1053.379519</td>\n",
       "      <td>9.253514</td>\n",
       "      <td>1058.349443</td>\n",
       "      <td>1047.663309</td>\n",
       "      <td>11318</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-09-04</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14304</th>\n",
       "      <td>zephyr-orpo-141b-A35b-v0.1</td>\n",
       "      <td>1127.188325</td>\n",
       "      <td>15.889332</td>\n",
       "      <td>1134.560465</td>\n",
       "      <td>1119.035415</td>\n",
       "      <td>4858</td>\n",
       "      <td>66.0</td>\n",
       "      <td>2024-09-04</td>\n",
       "      <td>2024-09-04</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14305 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       model_name       rating   variance  rating_q975  \\\n",
       "0                RWKV-4-Raven-14B   921.404758  19.994783   931.690090   \n",
       "1                      alpaca-13b   901.016508  21.937902   909.301810   \n",
       "2          bard-jan-24-gemini-pro  1208.468423   9.904404  1213.746117   \n",
       "3                      chatglm-6b   878.863152  25.038000   889.146926   \n",
       "4                     chatglm2-6b   923.934417  27.389251   933.436305   \n",
       "...                           ...          ...        ...          ...   \n",
       "14300                    yi-large  1212.459813   4.688072  1216.324375   \n",
       "14301            yi-large-preview  1239.529422   2.157219  1242.573535   \n",
       "14302             zephyr-7b-alpha  1041.270032  52.898264  1055.313443   \n",
       "14303              zephyr-7b-beta  1053.379519   9.253514  1058.349443   \n",
       "14304  zephyr-orpo-141b-A35b-v0.1  1127.188325  15.889332  1134.560465   \n",
       "\n",
       "       rating_q025  num_battles  final_ranking        date   datetime  \n",
       "0       913.823607         4943          105.0  2024-07-06 2024-07-06  \n",
       "1       891.855328         5876          109.0  2024-07-06 2024-07-06  \n",
       "2      1201.453472        11830           12.0  2024-07-06 2024-07-06  \n",
       "3       870.238677         4998          110.0  2024-07-06 2024-07-06  \n",
       "4       914.736414         2709          105.0  2024-07-06 2024-07-06  \n",
       "...            ...          ...            ...         ...        ...  \n",
       "14300  1208.165403        16672           27.0  2024-09-04 2024-09-04  \n",
       "14301  1236.674081        51715           24.0  2024-09-04 2024-09-04  \n",
       "14302  1026.192487         1815          105.0  2024-09-04 2024-09-04  \n",
       "14303  1047.663309        11318          105.0  2024-09-04 2024-09-04  \n",
       "14304  1119.035415         4858           66.0  2024-09-04 2024-09-04  \n",
       "\n",
       "[14305 rows x 9 columns]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# merged_results_df\n",
    "merged_results_df = merged_results_df.reset_index()\n",
    "merged_results_df = merged_results_df.rename(columns={'level_1': 'model_name'}).drop(columns=['level_0'])\n",
    "merged_results_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### sanity check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Date range in the merged results: 2024-01-09 to 2025-04-23\n",
      "Number of unique dates in the dataset: 99\n"
     ]
    }
   ],
   "source": [
    "min_date = merged_results_df['date'].min()\n",
    "max_date = merged_results_df['date'].max()\n",
    "print(f\"Date range in the merged results: {min_date} to {max_date}\")\n",
    "\n",
    "# Count the number of unique dates\n",
    "unique_dates = merged_results_df['date'].nunique()\n",
    "print(f\"Number of unique dates in the dataset: {unique_dates}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "243"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_results_df['model_name'].unique().size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### map license and provider using the leaderboard table csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "## get model providers mapping\n",
    "\n",
    "# get a latest leaderboard table to get provider and license information\n",
    "leaderboard_table = pd.read_csv('leaderboard_results/leaderboard_table_20250423.csv')\n",
    "\n",
    "model_providers_map = {row['key']: row['Organization'] for _, row in leaderboard_table.iterrows()}\n",
    "model_license_map = {row['key']: row['License'] for _, row in leaderboard_table.iterrows()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'athene-70b',\n",
       " 'chatgpt-4o-latest-2024-08-08',\n",
       " 'dbrx-instruct',\n",
       " 'gemini-2.0-flash-lite-preview',\n",
       " 'gpt-4-turbo',\n",
       " 'im-also-a-late-june-chatbot',\n",
       " 'late-june-chatbot',\n",
       " 'llama-3.1-nemotron-ultra-253b-v1',\n",
       " 'qwen-max-2025-01-25'}"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(merged_results_df['model_name'].to_list()) - set(leaderboard_table['key'].to_list())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'athene-70b',\n",
       " 'chatgpt-4o-latest-2024-08-08',\n",
       " 'dbrx-instruct',\n",
       " 'gemini-2.0-flash-lite-preview',\n",
       " 'gemini-2.5-flash-preview-04-17',\n",
       " 'gpt-4-turbo',\n",
       " 'gpt-4.1-2025-04-14',\n",
       " 'gpt-4.1-mini-2025-04-14',\n",
       " 'gpt-4.1-nano-2025-04-14',\n",
       " 'im-also-a-late-june-chatbot',\n",
       " 'late-june-chatbot',\n",
       " 'llama-3.1-nemotron-ultra-253b-v1',\n",
       " 'llama-4-maverick-03-26-experimental',\n",
       " 'llama-4-maverick-17b-128e-instruct',\n",
       " 'o3-2025-04-16',\n",
       " 'o4-mini-2025-04-16',\n",
       " 'qwen-max-2025-01-25'}"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(merged_results_df['model_name'].to_list()) - set(pd.read_csv('leaderboard_results/leaderboard_table_20250330.csv')['key'].to_list())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'baize-v2-13b',\n",
       " 'c4ai-aya-vision-32b',\n",
       " 'c4ai-aya-vision-8b',\n",
       " 'cogvlm2-llama3-chat-19b',\n",
       " 'dall-e-3',\n",
       " 'falcon-40b-instruct',\n",
       " 'flux-1-dev-fp8',\n",
       " 'flux-1.1-pro',\n",
       " 'glm-4-air',\n",
       " 'guanaco-65b',\n",
       " 'h2o-oasst-openllama-13b',\n",
       " 'hunyuan-standard-vision-2024-12-31',\n",
       " 'ideogram-v2',\n",
       " 'imagen-3.0-generate-002',\n",
       " 'internvl2-26b',\n",
       " 'internvl2-4b',\n",
       " 'llama-3.2-vision-11b-instruct',\n",
       " 'llama-3.2-vision-90b-instruct',\n",
       " 'llava-onevision-qwen2-72b-ov',\n",
       " 'llava-v1.6-34b',\n",
       " 'minicpm-v-2_6',\n",
       " 'molmo-72b-0924',\n",
       " 'molmo-7b-d-0924',\n",
       " 'mpt-30b-instruct',\n",
       " 'nous-hermes-13b',\n",
       " 'nvila-internal-15b-v1',\n",
       " 'openassistant-llama-30b',\n",
       " 'phi-3-vision-128k-instruct',\n",
       " 'phi-3.5-vision-instruct',\n",
       " 'photon',\n",
       " 'pixtral-12b-2409',\n",
       " 'pixtral-large-2411',\n",
       " 'qwen-vl-max-0809',\n",
       " 'qwen-vl-max-1119',\n",
       " 'qwen2-vl-72b',\n",
       " 'qwen2-vl-7b-instruct',\n",
       " 'qwen2.5-vl-32b-instruct',\n",
       " 'qwen2.5-vl-72b-instruct',\n",
       " 'recraft-v3',\n",
       " 'snorkel-mistral-pairrm-dpo',\n",
       " 'stable-diffusion-v35-large',\n",
       " 'step-1o-vision-32k-highres',\n",
       " 'step-1v-32k',\n",
       " 'tulu-30b',\n",
       " 'vicuna-13b-16k',\n",
       " 'vicuna-7b-16k',\n",
       " 'wizardlm-13b-v1.0',\n",
       " 'wizardlm-13b-v1.1',\n",
       " 'wizardlm-30b',\n",
       " 'xgen-7b-8k-inst',\n",
       " 'yi-vision'}"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# all vision models\n",
    "set(leaderboard_table['key'].to_list()) - set(merged_results_df['model_name'].to_list())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### handle outlier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_providers_map['athene-70b'] = \"NexusFlow\"\n",
    "model_providers_map['chatgpt-4o-latest-2024-08-08'] = \"OpenAI\"\n",
    "model_providers_map['dbrx-instruct'] = \"Databricks\"\n",
    "model_providers_map['gemini-2.0-flash-lite-preview'] = \"Google\"\n",
    "model_providers_map['gpt-4-turbo'] = \"OpenAI\"\n",
    "model_providers_map['im-also-a-late-june-chatbot'] = \"Unknown\"\n",
    "model_providers_map['late-june-chatbot'] = \"Unknown\"\n",
    "model_providers_map['qwen-max-2025-01-25'] = \"Alibaba\"\n",
    "model_providers_map['llama-3.1-nemotron-ultra-253b-v1'] = 'Nvidia'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model_name</th>\n",
       "      <th>rating</th>\n",
       "      <th>variance</th>\n",
       "      <th>rating_q975</th>\n",
       "      <th>rating_q025</th>\n",
       "      <th>num_battles</th>\n",
       "      <th>final_ranking</th>\n",
       "      <th>date</th>\n",
       "      <th>datetime</th>\n",
       "      <th>provider</th>\n",
       "      <th>license</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>RWKV-4-Raven-14B</td>\n",
       "      <td>921.404758</td>\n",
       "      <td>19.994783</td>\n",
       "      <td>931.69009</td>\n",
       "      <td>913.823607</td>\n",
       "      <td>4943</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>RWKV</td>\n",
       "      <td>Apache 2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alpaca-13b</td>\n",
       "      <td>901.016508</td>\n",
       "      <td>21.937902</td>\n",
       "      <td>909.30181</td>\n",
       "      <td>891.855328</td>\n",
       "      <td>5876</td>\n",
       "      <td>109.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>Stanford</td>\n",
       "      <td>Non-commercial</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         model_name      rating   variance  rating_q975  rating_q025  \\\n",
       "0  RWKV-4-Raven-14B  921.404758  19.994783    931.69009   913.823607   \n",
       "1        alpaca-13b  901.016508  21.937902    909.30181   891.855328   \n",
       "\n",
       "   num_battles  final_ranking        date   datetime  provider         license  \n",
       "0         4943          105.0  2024-07-06 2024-07-06      RWKV      Apache 2.0  \n",
       "1         5876          109.0  2024-07-06 2024-07-06  Stanford  Non-commercial  "
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_results_df['provider'] = merged_results_df['model_name'].map(model_providers_map)\n",
    "merged_results_df['license'] = merged_results_df['model_name'].map(model_license_map)\n",
    "\n",
    "merged_results_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_results_df.loc[merged_results_df['provider'] == 'Ai2', 'provider'] = 'Allen AI'\n",
    "merged_results_df.loc[merged_results_df['provider'] == 'AllenAI/UW', 'provider'] = 'Allen AI'\n",
    "merged_results_df.loc[merged_results_df['provider'] == 'DeepSeek', 'provider'] = 'DeepSeek AI'\n",
    "merged_results_df.loc[merged_results_df['provider'] == 'Nexusflow', 'provider'] = 'NexusFlow'\n",
    "merged_results_df.loc[merged_results_df['provider'] == 'Zhipu', 'provider'] = 'Zhipu AI'\n",
    "merged_results_df.loc[merged_results_df['provider'] == 'UW', 'provider'] = 'Allen AI'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "license_categories = {\n",
    "    # Fully Open Source licenses\n",
    "    'Fully Open Source': [\n",
    "        'Apache 2.0',\n",
    "        'Apache-2.0',\n",
    "        'MIT',\n",
    "        'CC-BY-SA 3.0',\n",
    "        'Open'\n",
    "    ],\n",
    "    \n",
    "    # Open Weights licenses (weights available but with restrictions)\n",
    "    'Open Weights': [\n",
    "        'AI2 ImpACT Low-risk',\n",
    "        'CC-BY-NC-4.0',\n",
    "        'CC-BY-NC-SA-4.0',\n",
    "        'CogVLM2',\n",
    "        'DBRX LICENSE',\n",
    "        'DeepSeek',\n",
    "        'DeepSeek License',\n",
    "        'Falcon-180B TII License',\n",
    "        'Gemma',\n",
    "        'Gemma license',\n",
    "        'Jamba Open',\n",
    "        'Llama',\n",
    "        'Llama 2 Community',\n",
    "        'Llama 3 Community',\n",
    "        'Llama 3.1',\n",
    "        'Llama 3.1 Community',\n",
    "        'Llama 3.2',\n",
    "        'Llama-3.3',\n",
    "        'MRL',\n",
    "        'Mistral Research',\n",
    "        'NVIDIA Open Model',\n",
    "        'NexusFlow',\n",
    "        'Non-commercial',\n",
    "        'Nvidia',\n",
    "        'Qianwen LICENSE',\n",
    "        'Qwen',\n",
    "        'Yi License',\n",
    "        'Llama 4'\n",
    "    ],\n",
    "    \n",
    "    # Proprietary licenses\n",
    "    'Proprietary': [\n",
    "        '-',\n",
    "        'Propretary', #spelling mistake in original data\n",
    "        'Proprietary',\n",
    "        'Other'\n",
    "    ]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model_name</th>\n",
       "      <th>rating</th>\n",
       "      <th>variance</th>\n",
       "      <th>rating_q975</th>\n",
       "      <th>rating_q025</th>\n",
       "      <th>num_battles</th>\n",
       "      <th>final_ranking</th>\n",
       "      <th>date</th>\n",
       "      <th>datetime</th>\n",
       "      <th>provider</th>\n",
       "      <th>license</th>\n",
       "      <th>license_category</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>RWKV-4-Raven-14B</td>\n",
       "      <td>921.404758</td>\n",
       "      <td>19.994783</td>\n",
       "      <td>931.690090</td>\n",
       "      <td>913.823607</td>\n",
       "      <td>4943</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>RWKV</td>\n",
       "      <td>Apache 2.0</td>\n",
       "      <td>Fully Open Source</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alpaca-13b</td>\n",
       "      <td>901.016508</td>\n",
       "      <td>21.937902</td>\n",
       "      <td>909.301810</td>\n",
       "      <td>891.855328</td>\n",
       "      <td>5876</td>\n",
       "      <td>109.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>Stanford</td>\n",
       "      <td>Non-commercial</td>\n",
       "      <td>Open Weights</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>bard-jan-24-gemini-pro</td>\n",
       "      <td>1208.468423</td>\n",
       "      <td>9.904404</td>\n",
       "      <td>1213.746117</td>\n",
       "      <td>1201.453472</td>\n",
       "      <td>11830</td>\n",
       "      <td>12.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>Google</td>\n",
       "      <td>Proprietary</td>\n",
       "      <td>Proprietary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chatglm-6b</td>\n",
       "      <td>878.863152</td>\n",
       "      <td>25.038000</td>\n",
       "      <td>889.146926</td>\n",
       "      <td>870.238677</td>\n",
       "      <td>4998</td>\n",
       "      <td>110.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>Tsinghua</td>\n",
       "      <td>Non-commercial</td>\n",
       "      <td>Open Weights</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>chatglm2-6b</td>\n",
       "      <td>923.934417</td>\n",
       "      <td>27.389251</td>\n",
       "      <td>933.436305</td>\n",
       "      <td>914.736414</td>\n",
       "      <td>2709</td>\n",
       "      <td>105.0</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>2024-07-06</td>\n",
       "      <td>Tsinghua</td>\n",
       "      <td>Apache-2.0</td>\n",
       "      <td>Fully Open Source</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               model_name       rating   variance  rating_q975  rating_q025  \\\n",
       "0        RWKV-4-Raven-14B   921.404758  19.994783   931.690090   913.823607   \n",
       "1              alpaca-13b   901.016508  21.937902   909.301810   891.855328   \n",
       "2  bard-jan-24-gemini-pro  1208.468423   9.904404  1213.746117  1201.453472   \n",
       "3              chatglm-6b   878.863152  25.038000   889.146926   870.238677   \n",
       "4             chatglm2-6b   923.934417  27.389251   933.436305   914.736414   \n",
       "\n",
       "   num_battles  final_ranking        date   datetime  provider  \\\n",
       "0         4943          105.0  2024-07-06 2024-07-06      RWKV   \n",
       "1         5876          109.0  2024-07-06 2024-07-06  Stanford   \n",
       "2        11830           12.0  2024-07-06 2024-07-06    Google   \n",
       "3         4998          110.0  2024-07-06 2024-07-06  Tsinghua   \n",
       "4         2709          105.0  2024-07-06 2024-07-06  Tsinghua   \n",
       "\n",
       "          license   license_category  \n",
       "0      Apache 2.0  Fully Open Source  \n",
       "1  Non-commercial       Open Weights  \n",
       "2     Proprietary        Proprietary  \n",
       "3  Non-commercial       Open Weights  \n",
       "4      Apache-2.0  Fully Open Source  "
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def assign_license_category(license):\n",
    "    if license in license_categories['Fully Open Source']:\n",
    "        return 'Fully Open Source'\n",
    "    elif license in license_categories['Open Weights']:\n",
    "        return 'Open Weights'\n",
    "    else:\n",
    "        return 'Proprietary'\n",
    "\n",
    "merged_results_df['license_category'] = merged_results_df['license'].apply(assign_license_category)\n",
    "merged_results_df.loc[merged_results_df['model_name'] == 'llama-4-maverick-03-26-experimental', 'license_category'] = 'Open Weights'\n",
    "merged_results_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### save to csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model_name</th>\n",
       "      <th>rating</th>\n",
       "      <th>variance</th>\n",
       "      <th>rating_q975</th>\n",
       "      <th>rating_q025</th>\n",
       "      <th>num_battles</th>\n",
       "      <th>final_ranking</th>\n",
       "      <th>date</th>\n",
       "      <th>datetime</th>\n",
       "      <th>provider</th>\n",
       "      <th>license</th>\n",
       "      <th>license_category</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [model_name, rating, variance, rating_q975, rating_q025, num_battles, final_ranking, date, datetime, provider, license, license_category]\n",
       "Index: []"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_results_df[merged_results_df['provider'].isna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model_name</th>\n",
       "      <th>rating</th>\n",
       "      <th>variance</th>\n",
       "      <th>rating_q975</th>\n",
       "      <th>rating_q025</th>\n",
       "      <th>num_battles</th>\n",
       "      <th>final_ranking</th>\n",
       "      <th>date</th>\n",
       "      <th>datetime</th>\n",
       "      <th>provider</th>\n",
       "      <th>license</th>\n",
       "      <th>license_category</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [model_name, rating, variance, rating_q975, rating_q025, num_battles, final_ranking, date, datetime, provider, license, license_category]\n",
       "Index: []"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_results_df[merged_results_df['license_category'].isna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the merged results to a CSV file\n",
    "\n",
    "merged_results_df.to_csv('merged_hf_leaderboard_history.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "elo",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
