{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f15440d9-bef9-4983-8fa3-90c91a16dfdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from datasets import load_dataset\n",
    "from tqdm.auto import tqdm\n",
    "import numpy as np\n",
    "import pickle\n",
    "import json\n",
    "import os\n",
    "from constants import *\n",
    "from utils_download import *\n",
    "\n",
    "# Credential placed in front of '@' in the huggingface.co clone URL. It is read from the\n",
    "# environment so that the secret is never saved inside the notebook; the default stays ''.\n",
    "# NOTE(review): presumably of the form 'username:token' -- confirm against clone_repo.\n",
    "username_token = os.environ.get('HF_USERNAME_TOKEN', '')\n",
    "# Snapshot date embedded in the names of the pickle files under data/.\n",
    "date = '20241205'\n",
    "# Leaderboard variant; used as the file-name prefix of the raw and processed pickles.\n",
    "lb = 'new'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6f1028ca-176d-442a-9d60-062026ba8fe9",
   "metadata": {},
   "outputs": [],
   "source": [
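    "# Optional one-time download step (disabled): clones every details dataset from huggingface.co\n",
    "# into new_repos/, the directory the raw-data cell further down reads its result files from.\n",
    "# NOTE: the loop uses `models`, which is only assigned in the next cell -- run that cell first\n",
    "# (or iterate over MODELS) before uncommenting.\n",
    "#outputs = []\n",
    "#for model in tqdm(models):\n",
    "#    outputs.append(clone_repo(f'https://{username_token}@huggingface.co/datasets/{model}/', target_directory=f'new_repos/{model}'.replace(\"open-llm-leaderboard/\",\"\")))"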
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c79e37a2-7331-454e-a74f-9f97312b7843",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lower-case aliases for the model and scenario lists used by the cells below.\n",
    "# MODELS / SCENARIOS are not defined in this notebook; presumably they come from the\n",
    "# star imports at the top (constants / utils_download) -- TODO confirm.\n",
    "models, scenarios = MODELS, SCENARIOS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1d45a714-8942-4b75-9a65-2761a737445f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1516accea706483e8f98b1895441ea5b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/20 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Collect the per-sample scores of every (model, scenario) pair from the cloned details repos.\n",
    "# Result: data[model][scenario]['correctness'] is an array of scores, a dict of arrays for\n",
    "# ifeval, or None when no results file was found.\n",
    "data = {}\n",
    "for model in tqdm(models):\n",
    "    data[model] = {}\n",
    "\n",
    "    # Results are looked up under new_repos/<repo>/<repo without '-details'>/.\n",
    "    repo_name = model.replace('open-llm-leaderboard/', '')\n",
    "    base_dir = 'new_repos/' + repo_name + '/' + repo_name.replace('-details', '') + '/'\n",
    "    repo_available = os.path.isdir(base_dir)\n",
    "\n",
    "    for s in scenarios:\n",
    "        scenario = s.split('|')[0]\n",
    "        data[model][scenario] = {}\n",
    "\n",
    "        # Search for the results file once; a repo directory that does not exist counts as missing.\n",
    "        file = find_folder_with_file_new(s, base_dir) if repo_available else None\n",
    "\n",
    "        if file is None:\n",
    "            data[model][scenario]['dates'] = None\n",
    "            data[model][scenario]['correctness'] = None\n",
    "            continue\n",
    "\n",
    "        # The file holds one JSON document per line; decode every line a single time.\n",
    "        with open(base_dir + file, 'r') as json_file:\n",
    "            records = [json.loads(line) for line in json_file]\n",
    "\n",
    "        if 'ifeval' in scenario:\n",
    "            # ifeval is stored at two levels: per instruction (averaged within a record and\n",
    "            # weighted by the number of entries, weights rescaled to mean 1) and per prompt.\n",
    "            inst_scores = [rec['inst_level_strict_acc'] for rec in records]\n",
    "            aux1 = np.array([np.mean(scores) for scores in inst_scores])\n",
    "            weight1 = np.array([len(scores) for scores in inst_scores]).astype(float)\n",
    "            weight1 /= weight1.mean()\n",
    "            aux2 = np.array([np.mean(rec['prompt_level_strict_acc']) for rec in records])\n",
    "            data[model][scenario]['correctness'] = {\n",
    "                'correctness_inst': aux1, 'weight_inst': weight1,\n",
    "                'correctness_prompt': aux2, 'weight_prompt': np.ones(len(aux2)).astype(float),\n",
    "            }\n",
    "        else:\n",
    "            # The name of the per-sample score field depends on the benchmark.\n",
    "            if 'mmlu_pro' in scenario: metric = 'acc'\n",
    "            elif 'math' in scenario: metric = 'exact_match'\n",
    "            else: metric = 'acc_norm'\n",
    "            data[model][scenario]['correctness'] = np.array([rec[metric] for rec in records])\n",
    "\n",
    "# Make sure the output directory exists so the collected data is not lost at the last step.\n",
    "os.makedirs('data', exist_ok=True)\n",
    "with open(f'data/new_leaderboard_raw_{date}.pickle', 'wb') as handle:\n",
    "    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4035711a-2ebc-4d78-858f-38d1c42572b0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cc312449f25e43ebbe8dd8527a8b30e1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/39 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def conversion(s):\n",
    "    \"\"\"Coerce one correctness entry to a number.\n",
    "\n",
    "    ints and floats (bools included) are returned unchanged, anything float()\n",
    "    accepts is parsed to a float, and the strings 'True' / 'False' map to 1 / 0.\n",
    "\n",
    "    Raises KeyError (TypeError for unhashable values) for anything else.\n",
    "    \"\"\"\n",
    "    if isinstance(s, (int, float)):\n",
    "        return s\n",
    "    try:\n",
    "        return float(s)\n",
    "    except (TypeError, ValueError):\n",
    "        # Only the boolean spellings can reach this point: numeric strings such as '1' or\n",
    "        # '0.0' are parsed by float() above, and real bools are ints and returned earlier.\n",
    "        return {'True': 1, 'False': 0}[s]\n",
    "\n",
    "# NOTE: pickle.load can execute arbitrary code -- only load files written by this notebook.\n",
    "with open(f'data/{lb}_leaderboard_raw_{date}.pickle', 'rb') as handle:\n",
    "    data = pickle.load(handle)\n",
    "\n",
    "def _collect_valid(raw, scenario, field=None):\n",
    "    \"\"\"Gather the converted correctness vectors of one scenario across models.\n",
    "\n",
    "    raw      -- mapping model -> scenario -> {'correctness': ...}\n",
    "    scenario -- scenario key to extract\n",
    "    field    -- key inside the correctness dict (ifeval); None uses the correctness array itself\n",
    "\n",
    "    Returns (rows, models): the vectors and the matching model names, restricted to models\n",
    "    whose vector length equals the median length so that the rows stack into one array.\n",
    "    \"\"\"\n",
    "    names, rows = [], []\n",
    "    for name, per_scenario in raw.items():\n",
    "        entry = per_scenario.get(scenario)  # a model may lack this scenario altogether\n",
    "        if entry is None or entry['correctness'] is None:\n",
    "            continue\n",
    "        values = entry['correctness'] if field is None else entry['correctness'][field]\n",
    "        names.append(name)\n",
    "        rows.append([conversion(v) for v in values])\n",
    "    if not rows:\n",
    "        return [], []\n",
    "    lengths = np.array([len(row) for row in rows])\n",
    "    valid = lengths == np.median(lengths)\n",
    "    return ([row for row, ok in zip(rows, valid) if ok],\n",
    "            [name for name, ok in zip(names, valid) if ok])\n",
    "\n",
    "pdata = {}\n",
    "# Sorted union of the scenario names of all models.\n",
    "all_scenarios = sorted({name for per_scenario in data.values() for name in per_scenario})\n",
    "for s in tqdm(all_scenarios):\n",
    "    pdata[s] = {}\n",
    "    if 'ifeval' in s:\n",
    "        pdata[s]['correctness'] = {}\n",
    "\n",
    "        for correct in ['inst', 'prompt']:\n",
    "            rows, kept_models = _collect_valid(data, s, 'correctness_'+correct)\n",
    "            pdata[s]['correctness']['correctness_'+correct] = np.array(rows).astype(float)\n",
    "            # Weights are copied from the first retained model (presumably identical across\n",
    "            # models -- TODO confirm); None when no model has results for this scenario.\n",
    "            pdata[s]['correctness']['weights_'+correct] = (\n",
    "                data[kept_models[0]][s]['correctness']['weight_'+correct] if kept_models else None\n",
    "            )\n",
    "            pdata[s]['models'] = kept_models\n",
    "    else:\n",
    "        rows, kept_models = _collect_valid(data, s)\n",
    "        pdata[s]['correctness'] = np.array(rows).astype(float)\n",
    "        pdata[s]['models'] = kept_models\n",
    "\n",
    "with open(f'data/{lb}_leaderboard_processed_{date}.pickle', 'wb') as handle:\n",
    "    pickle.dump(pdata, handle, protocol=pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1dab11f-a4b8-4dfc-8ba6-13983ee41b75",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
