{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import os\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_dir = '../output'\n",
    "full_chat_dir = os.path.join(output_dir, 'full_chat')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predictions for the full_chat dataset generated by google/flan-t5-xxl.\n",
    "full_chat_csv = os.path.join(full_chat_dir, 'google-flan-t5-xxl_predictions.csv')\n",
    "full_chat_df = pd.read_csv(full_chat_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['id', 'text', 'generated_text', 'seed', 'max_new_tokens', 'do_sample',\n",
       "       'response_time', 'tokens_per_second', 'milliseconds_per_token',\n",
       "       'torch_allocated_GBs', 'total_GPU_memory_used_GBs',\n",
       "       'GPU_memory_used_GBs', 'finish_reason', 'generated_tokens', 'task_id',\n",
       "       'hypothesis', 'dataset', 'model_name_or_path',\n",
       "       'weqweasdas/hh_rlhf_rm_open_llama_3b_eval_text',\n",
       "       'weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward',\n",
       "       'weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward_med_sigmoid'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "full_chat_df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>generated_text</th>\n",
       "      <th>seed</th>\n",
       "      <th>max_new_tokens</th>\n",
       "      <th>do_sample</th>\n",
       "      <th>response_time</th>\n",
       "      <th>tokens_per_second</th>\n",
       "      <th>milliseconds_per_token</th>\n",
       "      <th>torch_allocated_GBs</th>\n",
       "      <th>...</th>\n",
       "      <th>GPU_memory_used_GBs</th>\n",
       "      <th>finish_reason</th>\n",
       "      <th>generated_tokens</th>\n",
       "      <th>task_id</th>\n",
       "      <th>hypothesis</th>\n",
       "      <th>dataset</th>\n",
       "      <th>model_name_or_path</th>\n",
       "      <th>weqweasdas/hh_rlhf_rm_open_llama_3b_eval_text</th>\n",
       "      <th>weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward</th>\n",
       "      <th>weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward_med_sigmoid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>You are a helpful chatbot. Your purpose is to ...</td>\n",
       "      <td>You can use a lint roller to remove the pills.</td>\n",
       "      <td>42</td>\n",
       "      <td>1268</td>\n",
       "      <td>False</td>\n",
       "      <td>0.984879</td>\n",
       "      <td>16.245643</td>\n",
       "      <td>61.554966</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>{0: 8.126, 1: 8.12, 2: 8.152, 3: 8.164}</td>\n",
       "      <td>eos_token</td>\n",
       "      <td>16</td>\n",
       "      <td>6152</td>\n",
       "      <td>{'instruction': 'You are a helpful chatbot. Yo...</td>\n",
       "      <td>full_chat</td>\n",
       "      <td>google/flan-t5-xxl</td>\n",
       "      <td>###Human: How can I remove pilling from clothe...</td>\n",
       "      <td>5.468750</td>\n",
       "      <td>0.771843</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>You are a helpful chatbot. Your purpose is to ...</td>\n",
       "      <td>I can help you with that. Let me know if there...</td>\n",
       "      <td>42</td>\n",
       "      <td>1268</td>\n",
       "      <td>False</td>\n",
       "      <td>1.486358</td>\n",
       "      <td>16.146855</td>\n",
       "      <td>61.931564</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>{0: 8.156, 1: 8.15, 2: 8.18, 3: 8.196}</td>\n",
       "      <td>eos_token</td>\n",
       "      <td>24</td>\n",
       "      <td>8034</td>\n",
       "      <td>{'instruction': 'You are a helpful chatbot. Yo...</td>\n",
       "      <td>full_chat</td>\n",
       "      <td>google/flan-t5-xxl</td>\n",
       "      <td>###Human: Can you give me instructions on how ...</td>\n",
       "      <td>3.796875</td>\n",
       "      <td>0.388618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>You are a helpful chatbot. Your purpose is to ...</td>\n",
       "      <td>I can help you with that. What color are you i...</td>\n",
       "      <td>42</td>\n",
       "      <td>1268</td>\n",
       "      <td>False</td>\n",
       "      <td>0.942199</td>\n",
       "      <td>15.920209</td>\n",
       "      <td>62.813247</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>{0: 8.126, 1: 8.12, 2: 8.132, 3: 8.16}</td>\n",
       "      <td>eos_token</td>\n",
       "      <td>15</td>\n",
       "      <td>6917</td>\n",
       "      <td>{'instruction': 'You are a helpful chatbot. Yo...</td>\n",
       "      <td>full_chat</td>\n",
       "      <td>google/flan-t5-xxl</td>\n",
       "      <td>###Human: What colors are happy colors? ###Ass...</td>\n",
       "      <td>3.968750</td>\n",
       "      <td>0.430147</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>You are a helpful chatbot. Your purpose is to ...</td>\n",
       "      <td>I can help you with that. Let me know what you...</td>\n",
       "      <td>42</td>\n",
       "      <td>1268</td>\n",
       "      <td>False</td>\n",
       "      <td>1.304824</td>\n",
       "      <td>14.561350</td>\n",
       "      <td>68.674952</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>{0: 8.156, 1: 8.15, 2: 8.162, 3: 8.164}</td>\n",
       "      <td>eos_token</td>\n",
       "      <td>19</td>\n",
       "      <td>1249</td>\n",
       "      <td>{'instruction': 'You are a helpful chatbot. Yo...</td>\n",
       "      <td>full_chat</td>\n",
       "      <td>google/flan-t5-xxl</td>\n",
       "      <td>###Human: I recently ran into my bully from sc...</td>\n",
       "      <td>4.531250</td>\n",
       "      <td>0.569853</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>You are a helpful chatbot. Your purpose is to ...</td>\n",
       "      <td>I can help you with that. Let me know if there...</td>\n",
       "      <td>42</td>\n",
       "      <td>1268</td>\n",
       "      <td>False</td>\n",
       "      <td>1.482429</td>\n",
       "      <td>16.189644</td>\n",
       "      <td>61.767880</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>{0: 8.156, 1: 8.15, 2: 8.18, 3: 8.196}</td>\n",
       "      <td>eos_token</td>\n",
       "      <td>24</td>\n",
       "      <td>5913</td>\n",
       "      <td>{'instruction': 'You are a helpful chatbot. Yo...</td>\n",
       "      <td>full_chat</td>\n",
       "      <td>google/flan-t5-xxl</td>\n",
       "      <td>###Human: What kinds of video games are okay f...</td>\n",
       "      <td>4.781250</td>\n",
       "      <td>0.629775</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                               text  \\\n",
       "0   0  You are a helpful chatbot. Your purpose is to ...   \n",
       "1   1  You are a helpful chatbot. Your purpose is to ...   \n",
       "2   2  You are a helpful chatbot. Your purpose is to ...   \n",
       "3   3  You are a helpful chatbot. Your purpose is to ...   \n",
       "4   4  You are a helpful chatbot. Your purpose is to ...   \n",
       "\n",
       "                                      generated_text  seed  max_new_tokens  \\\n",
       "0     You can use a lint roller to remove the pills.    42            1268   \n",
       "1  I can help you with that. Let me know if there...    42            1268   \n",
       "2  I can help you with that. What color are you i...    42            1268   \n",
       "3  I can help you with that. Let me know what you...    42            1268   \n",
       "4  I can help you with that. Let me know if there...    42            1268   \n",
       "\n",
       "   do_sample  response_time  tokens_per_second  milliseconds_per_token  \\\n",
       "0      False       0.984879          16.245643               61.554966   \n",
       "1      False       1.486358          16.146855               61.931564   \n",
       "2      False       0.942199          15.920209               62.813247   \n",
       "3      False       1.304824          14.561350               68.674952   \n",
       "4      False       1.482429          16.189644               61.767880   \n",
       "\n",
       "   torch_allocated_GBs  ...                      GPU_memory_used_GBs  \\\n",
       "0                  0.0  ...  {0: 8.126, 1: 8.12, 2: 8.152, 3: 8.164}   \n",
       "1                  0.0  ...   {0: 8.156, 1: 8.15, 2: 8.18, 3: 8.196}   \n",
       "2                  0.0  ...   {0: 8.126, 1: 8.12, 2: 8.132, 3: 8.16}   \n",
       "3                  0.0  ...  {0: 8.156, 1: 8.15, 2: 8.162, 3: 8.164}   \n",
       "4                  0.0  ...   {0: 8.156, 1: 8.15, 2: 8.18, 3: 8.196}   \n",
       "\n",
       "  finish_reason generated_tokens  task_id  \\\n",
       "0     eos_token               16     6152   \n",
       "1     eos_token               24     8034   \n",
       "2     eos_token               15     6917   \n",
       "3     eos_token               19     1249   \n",
       "4     eos_token               24     5913   \n",
       "\n",
       "                                          hypothesis    dataset  \\\n",
       "0  {'instruction': 'You are a helpful chatbot. Yo...  full_chat   \n",
       "1  {'instruction': 'You are a helpful chatbot. Yo...  full_chat   \n",
       "2  {'instruction': 'You are a helpful chatbot. Yo...  full_chat   \n",
       "3  {'instruction': 'You are a helpful chatbot. Yo...  full_chat   \n",
       "4  {'instruction': 'You are a helpful chatbot. Yo...  full_chat   \n",
       "\n",
       "   model_name_or_path      weqweasdas/hh_rlhf_rm_open_llama_3b_eval_text  \\\n",
       "0  google/flan-t5-xxl  ###Human: How can I remove pilling from clothe...   \n",
       "1  google/flan-t5-xxl  ###Human: Can you give me instructions on how ...   \n",
       "2  google/flan-t5-xxl  ###Human: What colors are happy colors? ###Ass...   \n",
       "3  google/flan-t5-xxl  ###Human: I recently ran into my bully from sc...   \n",
       "4  google/flan-t5-xxl  ###Human: What kinds of video games are okay f...   \n",
       "\n",
       "  weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward  \\\n",
       "0                                        5.468750   \n",
       "1                                        3.796875   \n",
       "2                                        3.968750   \n",
       "3                                        4.531250   \n",
       "4                                        4.781250   \n",
       "\n",
       "   weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward_med_sigmoid  \n",
       "0                                           0.771843            \n",
       "1                                           0.388618            \n",
       "2                                           0.430147            \n",
       "3                                           0.569853            \n",
       "4                                           0.629775            \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "full_chat_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        You are a helpful chatbot. Your purpose is to ...\n",
       "1        You are a helpful chatbot. Your purpose is to ...\n",
       "2        You are a helpful chatbot. Your purpose is to ...\n",
       "3        You are a helpful chatbot. Your purpose is to ...\n",
       "4        You are a helpful chatbot. Your purpose is to ...\n",
       "                               ...                        \n",
       "99995    You are a helpful chatbot. Your mission is to ...\n",
       "99996    You are a helpful chatbot. Your mission is to ...\n",
       "99997    You are a helpful chatbot. Your mission is to ...\n",
       "99998    You are a helpful chatbot. Your mission is to ...\n",
       "99999    You are a helpful chatbot. Your mission is to ...\n",
       "Name: hypothesis, Length: 100000, dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `hypothesis` stores the repr of a Python dict. Parse it with ast.literal_eval,\n",
    "# which only accepts literals, instead of eval(), which would execute any\n",
    "# expression that happens to be in the CSV.\n",
    "full_chat_df['hypothesis'].apply(lambda cell: ast.literal_eval(cell)['instruction'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward</th>\n",
       "      <th>weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward_med_sigmoid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>100000.000000</td>\n",
       "      <td>100000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>4.416781</td>\n",
       "      <td>0.516457</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.607894</td>\n",
       "      <td>0.273088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-2.500000</td>\n",
       "      <td>0.001170</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>3.328125</td>\n",
       "      <td>0.284576</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>4.250000</td>\n",
       "      <td>0.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>5.375000</td>\n",
       "      <td>0.754915</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>14.125000</td>\n",
       "      <td>0.999949</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward  \\\n",
       "count                                    100000.000000   \n",
       "mean                                          4.416781   \n",
       "std                                           1.607894   \n",
       "min                                          -2.500000   \n",
       "25%                                           3.328125   \n",
       "50%                                           4.250000   \n",
       "75%                                           5.375000   \n",
       "max                                          14.125000   \n",
       "\n",
       "       weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward_med_sigmoid  \n",
       "count                                      100000.000000            \n",
       "mean                                            0.516457            \n",
       "std                                             0.273088            \n",
       "min                                             0.001170            \n",
       "25%                                             0.284576            \n",
       "50%                                             0.500000            \n",
       "75%                                             0.754915            \n",
       "max                                             0.999949            "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reward-model score column and its `_med_sigmoid` companion column.\n",
    "reward_cols = [\n",
    "    'weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward',\n",
    "    'weqweasdas/hh_rlhf_rm_open_llama_3b_eval_reward_med_sigmoid',\n",
    "]\n",
    "full_chat_df[reward_cols].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MBPP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predictions for the mbpp dataset generated by codellama/CodeLlama-7b-Instruct-hf.\n",
    "mbpp_dir = os.path.join(output_dir, 'mbpp')\n",
    "mbpp_csv = os.path.join(mbpp_dir, 'codellama-CodeLlama-7b-Instruct-hf_predictions.csv')\n",
    "mbpp_df = pd.read_csv(mbpp_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "28920"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(mbpp_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['id', 'text', 'generated_text', 'seed', 'max_new_tokens', 'do_sample',\n",
       "       'stop_sequences', 'temperature', 'top_p', 'response_time',\n",
       "       'tokens_per_second', 'milliseconds_per_token', 'torch_allocated_GBs',\n",
       "       'total_GPU_memory_used_GBs', 'GPU_memory_used_GBs', 'finish_reason',\n",
       "       'generated_tokens', 'task_id', 'hypothesis', 'dataset',\n",
       "       'model_name_or_path', 'generated_code', 'test_code', 'passed', 'result',\n",
       "       'hash'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mbpp_df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>generated_text</th>\n",
       "      <th>seed</th>\n",
       "      <th>max_new_tokens</th>\n",
       "      <th>do_sample</th>\n",
       "      <th>stop_sequences</th>\n",
       "      <th>temperature</th>\n",
       "      <th>top_p</th>\n",
       "      <th>response_time</th>\n",
       "      <th>...</th>\n",
       "      <th>generated_tokens</th>\n",
       "      <th>task_id</th>\n",
       "      <th>hypothesis</th>\n",
       "      <th>dataset</th>\n",
       "      <th>model_name_or_path</th>\n",
       "      <th>generated_code</th>\n",
       "      <th>test_code</th>\n",
       "      <th>passed</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>[INST] &lt;&lt;SYS&gt;&gt;\\nYour goal is to write code tha...</td>\n",
       "      <td>[PYTHON]\\ndef max_volume(side_sum):\\n    retur...</td>\n",
       "      <td>42</td>\n",
       "      <td>834</td>\n",
       "      <td>True</td>\n",
       "      <td>['[/PYTHON]', '&lt;/s&gt;']</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.95</td>\n",
       "      <td>1.852320</td>\n",
       "      <td>...</td>\n",
       "      <td>38</td>\n",
       "      <td>318</td>\n",
       "      <td>{'instruction': 'Your goal is to write code th...</td>\n",
       "      <td>mbpp</td>\n",
       "      <td>codellama/CodeLlama-7b-Instruct-hf</td>\n",
       "      <td>def max_volume(side_sum):\\n    return (side_su...</td>\n",
       "      <td>assert max_volume(8) == 18\\nassert max_volume(...</td>\n",
       "      <td>False</td>\n",
       "      <td>failed:</td>\n",
       "      <td>82ffe7d05e3196418e39d9aa46bf4126eb2759a419ca25...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20</td>\n",
       "      <td>[INST] &lt;&lt;SYS&gt;&gt;\\nYour goal is to write code tha...</td>\n",
       "      <td>[PYTHON]\\ndef max_volume(sum_of_sides):\\n    #...</td>\n",
       "      <td>43</td>\n",
       "      <td>834</td>\n",
       "      <td>True</td>\n",
       "      <td>['[/PYTHON]', '&lt;/s&gt;']</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.95</td>\n",
       "      <td>5.123448</td>\n",
       "      <td>...</td>\n",
       "      <td>119</td>\n",
       "      <td>318</td>\n",
       "      <td>{'instruction': 'Your goal is to write code th...</td>\n",
       "      <td>mbpp</td>\n",
       "      <td>codellama/CodeLlama-7b-Instruct-hf</td>\n",
       "      <td>def max_volume(sum_of_sides):\\n    # Find the ...</td>\n",
       "      <td>assert max_volume(8) == 18\\nassert max_volume(...</td>\n",
       "      <td>False</td>\n",
       "      <td>failed:</td>\n",
       "      <td>c57d9f5efe770f1ce1a429101f3bd4aee5c529e81a3a84...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>[INST] &lt;&lt;SYS&gt;&gt;\\nYou are tasked with writing co...</td>\n",
       "      <td>[PYTHON]\\ndef max_volume(side_sum):\\n    retur...</td>\n",
       "      <td>42</td>\n",
       "      <td>834</td>\n",
       "      <td>True</td>\n",
       "      <td>['[/PYTHON]', '&lt;/s&gt;']</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.95</td>\n",
       "      <td>1.870779</td>\n",
       "      <td>...</td>\n",
       "      <td>38</td>\n",
       "      <td>318</td>\n",
       "      <td>{'instruction': 'You are tasked with writing c...</td>\n",
       "      <td>mbpp</td>\n",
       "      <td>codellama/CodeLlama-7b-Instruct-hf</td>\n",
       "      <td>def max_volume(side_sum):\\n    return (side_su...</td>\n",
       "      <td>assert max_volume(8) == 18\\nassert max_volume(...</td>\n",
       "      <td>False</td>\n",
       "      <td>failed:</td>\n",
       "      <td>fdc20642fbdf1295bb92b21837336e90576db0996d188f...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20</td>\n",
       "      <td>[INST] &lt;&lt;SYS&gt;&gt;\\nYou are tasked with writing co...</td>\n",
       "      <td>[PYTHON]\\ndef max_volume(s):\\n    return (s **...</td>\n",
       "      <td>43</td>\n",
       "      <td>834</td>\n",
       "      <td>True</td>\n",
       "      <td>['[/PYTHON]', '&lt;/s&gt;']</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.95</td>\n",
       "      <td>2.066289</td>\n",
       "      <td>...</td>\n",
       "      <td>33</td>\n",
       "      <td>318</td>\n",
       "      <td>{'instruction': 'You are tasked with writing c...</td>\n",
       "      <td>mbpp</td>\n",
       "      <td>codellama/CodeLlama-7b-Instruct-hf</td>\n",
       "      <td>def max_volume(s):\\n    return (s ** 3) / 3</td>\n",
       "      <td>assert max_volume(8) == 18\\nassert max_volume(...</td>\n",
       "      <td>False</td>\n",
       "      <td>failed:</td>\n",
       "      <td>e75b672fa3846b616e2f7123616d13f726edd37433f252...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>[INST] &lt;&lt;SYS&gt;&gt;\\nYour goal is to write code tha...</td>\n",
       "      <td>[PYTHON]\\ndef add_tuple(my_list, new_tuple):\\n...</td>\n",
       "      <td>42</td>\n",
       "      <td>834</td>\n",
       "      <td>True</td>\n",
       "      <td>['[/PYTHON]', '&lt;/s&gt;']</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.95</td>\n",
       "      <td>1.928691</td>\n",
       "      <td>...</td>\n",
       "      <td>40</td>\n",
       "      <td>750</td>\n",
       "      <td>{'instruction': 'Your goal is to write code th...</td>\n",
       "      <td>mbpp</td>\n",
       "      <td>codellama/CodeLlama-7b-Instruct-hf</td>\n",
       "      <td>def add_tuple(my_list, new_tuple):\\n    return...</td>\n",
       "      <td>assert add_tuple([5, 6, 7], (9, 10)) == [5, 6,...</td>\n",
       "      <td>True</td>\n",
       "      <td>passed</td>\n",
       "      <td>e5804126c60e10a4949fb558f1f501049449802d8d1f39...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 26 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                               text  \\\n",
       "0   0  [INST] <<SYS>>\\nYour goal is to write code tha...   \n",
       "1  20  [INST] <<SYS>>\\nYour goal is to write code tha...   \n",
       "2   0  [INST] <<SYS>>\\nYou are tasked with writing co...   \n",
       "3  20  [INST] <<SYS>>\\nYou are tasked with writing co...   \n",
       "4   1  [INST] <<SYS>>\\nYour goal is to write code tha...   \n",
       "\n",
       "                                      generated_text  seed  max_new_tokens  \\\n",
       "0  [PYTHON]\\ndef max_volume(side_sum):\\n    retur...    42             834   \n",
       "1  [PYTHON]\\ndef max_volume(sum_of_sides):\\n    #...    43             834   \n",
       "2  [PYTHON]\\ndef max_volume(side_sum):\\n    retur...    42             834   \n",
       "3  [PYTHON]\\ndef max_volume(s):\\n    return (s **...    43             834   \n",
       "4  [PYTHON]\\ndef add_tuple(my_list, new_tuple):\\n...    42             834   \n",
       "\n",
       "   do_sample         stop_sequences  temperature  top_p  response_time  ...  \\\n",
       "0       True  ['[/PYTHON]', '</s>']          0.8   0.95       1.852320  ...   \n",
       "1       True  ['[/PYTHON]', '</s>']          0.8   0.95       5.123448  ...   \n",
       "2       True  ['[/PYTHON]', '</s>']          0.8   0.95       1.870779  ...   \n",
       "3       True  ['[/PYTHON]', '</s>']          0.8   0.95       2.066289  ...   \n",
       "4       True  ['[/PYTHON]', '</s>']          0.8   0.95       1.928691  ...   \n",
       "\n",
       "   generated_tokens  task_id  \\\n",
       "0                38      318   \n",
       "1               119      318   \n",
       "2                38      318   \n",
       "3                33      318   \n",
       "4                40      750   \n",
       "\n",
       "                                          hypothesis  dataset  \\\n",
       "0  {'instruction': 'Your goal is to write code th...     mbpp   \n",
       "1  {'instruction': 'Your goal is to write code th...     mbpp   \n",
       "2  {'instruction': 'You are tasked with writing c...     mbpp   \n",
       "3  {'instruction': 'You are tasked with writing c...     mbpp   \n",
       "4  {'instruction': 'Your goal is to write code th...     mbpp   \n",
       "\n",
       "                   model_name_or_path  \\\n",
       "0  codellama/CodeLlama-7b-Instruct-hf   \n",
       "1  codellama/CodeLlama-7b-Instruct-hf   \n",
       "2  codellama/CodeLlama-7b-Instruct-hf   \n",
       "3  codellama/CodeLlama-7b-Instruct-hf   \n",
       "4  codellama/CodeLlama-7b-Instruct-hf   \n",
       "\n",
       "                                      generated_code  \\\n",
       "0  def max_volume(side_sum):\\n    return (side_su...   \n",
       "1  def max_volume(sum_of_sides):\\n    # Find the ...   \n",
       "2  def max_volume(side_sum):\\n    return (side_su...   \n",
       "3        def max_volume(s):\\n    return (s ** 3) / 3   \n",
       "4  def add_tuple(my_list, new_tuple):\\n    return...   \n",
       "\n",
       "                                           test_code  passed    result  \\\n",
       "0  assert max_volume(8) == 18\\nassert max_volume(...   False  failed:    \n",
       "1  assert max_volume(8) == 18\\nassert max_volume(...   False  failed:    \n",
       "2  assert max_volume(8) == 18\\nassert max_volume(...   False  failed:    \n",
       "3  assert max_volume(8) == 18\\nassert max_volume(...   False  failed:    \n",
       "4  assert add_tuple([5, 6, 7], (9, 10)) == [5, 6,...    True    passed   \n",
       "\n",
       "                                                hash  \n",
       "0  82ffe7d05e3196418e39d9aa46bf4126eb2759a419ca25...  \n",
       "1  c57d9f5efe770f1ce1a429101f3bd4aee5c529e81a3a84...  \n",
       "2  fdc20642fbdf1295bb92b21837336e90576db0996d188f...  \n",
       "3  e75b672fa3846b616e2f7123616d13f726edd37433f252...  \n",
       "4  e5804126c60e10a4949fb558f1f501049449802d8d1f39...  \n",
       "\n",
       "[5 rows x 26 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mbpp_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INST] <<SYS>>\n",
      "Your goal is to write code that performs the specified task.\n",
      "<</SYS>>\n",
      "\n",
      "Write a function to add the given tuple to the given list.\n",
      "Your code should pass these tests:\n",
      "\n",
      "assert add_tuple([5, 6, 7], (9, 10)) == [5, 6, 7, 9, 10]\n",
      "assert add_tuple([6, 7, 8], (10, 11)) == [6, 7, 8, 10, 11]\n",
      "assert add_tuple([7, 8, 9], (11, 12)) == [7, 8, 9, 11, 12]\n",
      "Your code should start with a [PYTHON] tag and end with a [/PYTHON] tag. [/INST]\n"
     ]
    }
   ],
   "source": [
    "# Prompt for row 4, the only sample in the head() preview whose tests passed.\n",
    "print(mbpp_df['text'].iat[4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[PYTHON]\n",
      "def add_tuple(my_list, new_tuple):\n",
      "    return my_list + list(new_tuple)\n",
      "[/PYTHON]\n"
     ]
    }
   ],
   "source": [
    "# Raw model completion for row 4, the only sample in the head() preview whose tests passed.\n",
    "print(mbpp_df['generated_text'].iat[4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: import pass_at_k from scripts.code_eval.evaluation and compute pass@k for the MBPP predictions"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
