{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "import statcontracts.optimize\n",
    "from notebook_env import *\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# Path of the JSONL input file inside `datasets_directory`.\n",
    "# NOTE(review): `datasets_directory` is not defined in this notebook; presumably it is\n",
    "# provided by `from notebook_env import *` - confirm.\n",
    "f_path = f'{datasets_directory}/gpt-4_single.jsonl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_id</th>\n",
       "      <th>model</th>\n",
       "      <th>judge</th>\n",
       "      <th>user_prompt</th>\n",
       "      <th>judgment</th>\n",
       "      <th>score</th>\n",
       "      <th>turn</th>\n",
       "      <th>tstamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>81</td>\n",
       "      <td>alpaca-13b</td>\n",
       "      <td>[gpt-4, single-v1]</td>\n",
       "      <td>[Instruction]\\nPlease act as an impartial judg...</td>\n",
       "      <td>The assistant's response is relevant and accur...</td>\n",
       "      <td>7.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.687222e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>81</td>\n",
       "      <td>alpaca-13b</td>\n",
       "      <td>[gpt-4, single-v1-multi-turn]</td>\n",
       "      <td>&lt;|The Start of Assistant A's Conversation with...</td>\n",
       "      <td>The assistant failed to follow the user's inst...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.687222e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>82</td>\n",
       "      <td>alpaca-13b</td>\n",
       "      <td>[gpt-4, single-v1]</td>\n",
       "      <td>[Instruction]\\nPlease act as an impartial judg...</td>\n",
       "      <td>The assistant's response is concise, professio...</td>\n",
       "      <td>8.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.687224e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>82</td>\n",
       "      <td>alpaca-13b</td>\n",
       "      <td>[gpt-4, single-v1-multi-turn]</td>\n",
       "      <td>&lt;|The Start of Assistant A's Conversation with...</td>\n",
       "      <td>The AI assistant's self-evaluation is accurate...</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.687223e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>83</td>\n",
       "      <td>alpaca-13b</td>\n",
       "      <td>[gpt-4, single-v1]</td>\n",
       "      <td>[Instruction]\\nPlease act as an impartial judg...</td>\n",
       "      <td>The assistant's response is very relevant and ...</td>\n",
       "      <td>10.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.687222e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5435</th>\n",
       "      <td>134</td>\n",
       "      <td>Llama-2-70b-chat</td>\n",
       "      <td>[gpt-4, single-v1-multi-turn]</td>\n",
       "      <td>&lt;|The Start of Assistant A's Conversation with...</td>\n",
       "      <td>The assistant's response is accurate, relevant...</td>\n",
       "      <td>10.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.689738e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5436</th>\n",
       "      <td>90</td>\n",
       "      <td>Llama-2-70b-chat</td>\n",
       "      <td>[gpt-4, single-v1]</td>\n",
       "      <td>[Instruction]\\nPlease act as an impartial judg...</td>\n",
       "      <td>The assistant's response is accurate, relevant...</td>\n",
       "      <td>8.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.689738e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5437</th>\n",
       "      <td>158</td>\n",
       "      <td>Llama-2-70b-chat</td>\n",
       "      <td>[gpt-4, single-v1]</td>\n",
       "      <td>[Instruction]\\nPlease act as an impartial judg...</td>\n",
       "      <td>The assistant's response is highly informative...</td>\n",
       "      <td>10.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.689738e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5438</th>\n",
       "      <td>118</td>\n",
       "      <td>Llama-2-70b-chat</td>\n",
       "      <td>[gpt-4, single-math-v1]</td>\n",
       "      <td>[Instruction]\\nPlease act as an impartial judg...</td>\n",
       "      <td>The assistant's response is incorrect. The ass...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.689738e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5439</th>\n",
       "      <td>148</td>\n",
       "      <td>Llama-2-70b-chat</td>\n",
       "      <td>[gpt-4, single-v1-multi-turn]</td>\n",
       "      <td>&lt;|The Start of Assistant A's Conversation with...</td>\n",
       "      <td>The assistant's response is highly detailed, a...</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.689738e+09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5440 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      question_id             model                          judge  \\\n",
       "0              81        alpaca-13b             [gpt-4, single-v1]   \n",
       "1              81        alpaca-13b  [gpt-4, single-v1-multi-turn]   \n",
       "2              82        alpaca-13b             [gpt-4, single-v1]   \n",
       "3              82        alpaca-13b  [gpt-4, single-v1-multi-turn]   \n",
       "4              83        alpaca-13b             [gpt-4, single-v1]   \n",
       "...           ...               ...                            ...   \n",
       "5435          134  Llama-2-70b-chat  [gpt-4, single-v1-multi-turn]   \n",
       "5436           90  Llama-2-70b-chat             [gpt-4, single-v1]   \n",
       "5437          158  Llama-2-70b-chat             [gpt-4, single-v1]   \n",
       "5438          118  Llama-2-70b-chat        [gpt-4, single-math-v1]   \n",
       "5439          148  Llama-2-70b-chat  [gpt-4, single-v1-multi-turn]   \n",
       "\n",
       "                                            user_prompt  \\\n",
       "0     [Instruction]\\nPlease act as an impartial judg...   \n",
       "1     <|The Start of Assistant A's Conversation with...   \n",
       "2     [Instruction]\\nPlease act as an impartial judg...   \n",
       "3     <|The Start of Assistant A's Conversation with...   \n",
       "4     [Instruction]\\nPlease act as an impartial judg...   \n",
       "...                                                 ...   \n",
       "5435  <|The Start of Assistant A's Conversation with...   \n",
       "5436  [Instruction]\\nPlease act as an impartial judg...   \n",
       "5437  [Instruction]\\nPlease act as an impartial judg...   \n",
       "5438  [Instruction]\\nPlease act as an impartial judg...   \n",
       "5439  <|The Start of Assistant A's Conversation with...   \n",
       "\n",
       "                                               judgment  score  turn  \\\n",
       "0     The assistant's response is relevant and accur...    7.0     1   \n",
       "1     The assistant failed to follow the user's inst...    1.0     2   \n",
       "2     The assistant's response is concise, professio...    8.0     1   \n",
       "3     The AI assistant's self-evaluation is accurate...    7.0     2   \n",
       "4     The assistant's response is very relevant and ...   10.0     1   \n",
       "...                                                 ...    ...   ...   \n",
       "5435  The assistant's response is accurate, relevant...   10.0     2   \n",
       "5436  The assistant's response is accurate, relevant...    8.0     1   \n",
       "5437  The assistant's response is highly informative...   10.0     1   \n",
       "5438  The assistant's response is incorrect. The ass...    1.0     1   \n",
       "5439  The assistant's response is highly detailed, a...    9.0     2   \n",
       "\n",
       "            tstamp  \n",
       "0     1.687222e+09  \n",
       "1     1.687222e+09  \n",
       "2     1.687224e+09  \n",
       "3     1.687223e+09  \n",
       "4     1.687222e+09  \n",
       "...            ...  \n",
       "5435  1.689738e+09  \n",
       "5436  1.689738e+09  \n",
       "5437  1.689738e+09  \n",
       "5438  1.689738e+09  \n",
       "5439  1.689738e+09  \n",
       "\n",
       "[5440 rows x 8 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the file at `f_path` as JSON Lines (one JSON object per line) into a DataFrame.\n",
    "# NOTE(review): `pd` is not imported explicitly in this notebook; presumably it is\n",
    "# provided by `from notebook_env import *` - confirm.\n",
    "raw_df = pd.read_json(f_path, lines = True)\n",
    "raw_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# function that takes the raw paragraph and returns only string in between \"[The Start of Assistant's Answer]\" and \"[The End of Assistant's Answer]\"\n",
    "\n",
    "def split_response(paragraph):\n",
    "    start = \"[The Start of Assistant's Answer]\"\n",
    "    end = \"[The End of Assistant's Answer]\"\n",
    "    start_idx = paragraph.index(start) + len(start)\n",
    "    end_idx = paragraph.index(end)\n",
    "    return paragraph[start_idx:end_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# Names of the Llama-2 chat models whose responses are compared.\n",
    "LLAMA2_CHAT_MODELS = [f'Llama-2-{size}b-chat' for size in ('7', '13', '70')]\n",
    "\n",
    "\n",
    "def model_predicate(st):\n",
    "    \"\"\"Return True when `st` is one of the names in LLAMA2_CHAT_MODELS.\"\"\"\n",
    "    return st in LLAMA2_CHAT_MODELS\n",
    "\n",
    "\n",
    "# Mean length (in characters) of the extracted response, per model.\n",
    "# NOTE: despite its name, `df` ends up as a Series indexed by `model`.\n",
    "df = (\n",
    "    raw_df\n",
    "    .loc[lambda frame: frame['model'].isin(LLAMA2_CHAT_MODELS)]\n",
    "    # The last element of each `judge` entry is used as the prompt type.\n",
    "    .assign(prompt_type=lambda frame: frame['judge'].apply(lambda judge: judge[-1]))\n",
    "    .query('prompt_type == \"single-v1\"')\n",
    "    # Keep one row per (model, prompt) so that a prompt appearing in several\n",
    "    # rows is counted once in the mean.\n",
    "    .drop_duplicates(subset=['model', 'user_prompt'])\n",
    "    .assign(mdl_response=lambda frame: frame['user_prompt'].apply(split_response))\n",
    "    .groupby('model')['mdl_response']\n",
    "    .apply(lambda responses: responses.str.len().mean())\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "model\n",
       "Llama-2-13b-chat    1573.02\n",
       "Llama-2-70b-chat    1625.02\n",
       "Llama-2-7b-chat     1695.92\n",
       "Name: mdl_response, dtype: float64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the per-model mean string length of `mdl_response`.\n",
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
