{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import os\n",
    "import pickle as pkl\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MBPP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the MBPP predictions CSV under the shared LLM output folder, then load it.\n",
    "output_dir = '../llm_output'\n",
    "mbpp_dir = os.path.join(output_dir, 'mbpp')\n",
    "predictions_path = os.path.join(mbpp_dir, 'codellama-CodeLlama-7b-Instruct-hf_predictions.csv')\n",
    "mbpp_df = pd.read_csv(predictions_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "289200"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total number of rows in the predictions table (each row is one generation).\n",
    "len(mbpp_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['id', 'text', 'generated_text', 'seed', 'max_new_tokens', 'do_sample',\n",
       "       'stop_sequences', 'temperature', 'top_p', 'response_time',\n",
       "       'tokens_per_second', 'milliseconds_per_token', 'torch_allocated_GBs',\n",
       "       'total_GPU_memory_used_GBs', 'GPU_memory_used_GBs', 'finish_reason',\n",
       "       'generated_tokens', 'task_id', 'hypothesis', 'dataset',\n",
       "       'model_name_or_path', 'generated_code', 'test_code', 'passed', 'result',\n",
       "       'hash'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the available columns before selecting the ones used below.\n",
    "mbpp_df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: count rows whose `passed` value is missing.\n",
    "mbpp_df[\"passed\"].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>generated_text</th>\n",
       "      <th>seed</th>\n",
       "      <th>max_new_tokens</th>\n",
       "      <th>do_sample</th>\n",
       "      <th>stop_sequences</th>\n",
       "      <th>temperature</th>\n",
       "      <th>top_p</th>\n",
       "      <th>response_time</th>\n",
       "      <th>...</th>\n",
       "      <th>generated_tokens</th>\n",
       "      <th>task_id</th>\n",
       "      <th>hypothesis</th>\n",
       "      <th>dataset</th>\n",
       "      <th>model_name_or_path</th>\n",
       "      <th>generated_code</th>\n",
       "      <th>test_code</th>\n",
       "      <th>passed</th>\n",
       "      <th>result</th>\n",
       "      <th>hash</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>[INST] &lt;&lt;SYS&gt;&gt;\\nYour goal is to write code tha...</td>\n",
       "      <td>[PYTHON]\\ndef max_volume(side_sum):\\n    retur...</td>\n",
       "      <td>42</td>\n",
       "      <td>834</td>\n",
       "      <td>True</td>\n",
       "      <td>['[/PYTHON]', '&lt;/s&gt;']</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.95</td>\n",
       "      <td>1.852320</td>\n",
       "      <td>...</td>\n",
       "      <td>38</td>\n",
       "      <td>318</td>\n",
       "      <td>{'instruction': 'Your goal is to write code th...</td>\n",
       "      <td>mbpp</td>\n",
       "      <td>codellama/CodeLlama-7b-Instruct-hf</td>\n",
       "      <td>def max_volume(side_sum):\\n    return (side_su...</td>\n",
       "      <td>assert max_volume(8) == 18\\nassert max_volume(...</td>\n",
       "      <td>False</td>\n",
       "      <td>failed:</td>\n",
       "      <td>82ffe7d05e3196418e39d9aa46bf4126eb2759a419ca25...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20</td>\n",
       "      <td>[INST] &lt;&lt;SYS&gt;&gt;\\nYour goal is to write code tha...</td>\n",
       "      <td>[PYTHON]\\ndef max_volume(sum_of_sides):\\n    #...</td>\n",
       "      <td>43</td>\n",
       "      <td>834</td>\n",
       "      <td>True</td>\n",
       "      <td>['[/PYTHON]', '&lt;/s&gt;']</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.95</td>\n",
       "      <td>5.123448</td>\n",
       "      <td>...</td>\n",
       "      <td>119</td>\n",
       "      <td>318</td>\n",
       "      <td>{'instruction': 'Your goal is to write code th...</td>\n",
       "      <td>mbpp</td>\n",
       "      <td>codellama/CodeLlama-7b-Instruct-hf</td>\n",
       "      <td>def max_volume(sum_of_sides):\\n    # Find the ...</td>\n",
       "      <td>assert max_volume(8) == 18\\nassert max_volume(...</td>\n",
       "      <td>False</td>\n",
       "      <td>failed:</td>\n",
       "      <td>c57d9f5efe770f1ce1a429101f3bd4aee5c529e81a3a84...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>[INST] &lt;&lt;SYS&gt;&gt;\\nYou are tasked with writing co...</td>\n",
       "      <td>[PYTHON]\\ndef max_volume(side_sum):\\n    retur...</td>\n",
       "      <td>42</td>\n",
       "      <td>834</td>\n",
       "      <td>True</td>\n",
       "      <td>['[/PYTHON]', '&lt;/s&gt;']</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.95</td>\n",
       "      <td>1.870779</td>\n",
       "      <td>...</td>\n",
       "      <td>38</td>\n",
       "      <td>318</td>\n",
       "      <td>{'instruction': 'You are tasked with writing c...</td>\n",
       "      <td>mbpp</td>\n",
       "      <td>codellama/CodeLlama-7b-Instruct-hf</td>\n",
       "      <td>def max_volume(side_sum):\\n    return (side_su...</td>\n",
       "      <td>assert max_volume(8) == 18\\nassert max_volume(...</td>\n",
       "      <td>False</td>\n",
       "      <td>failed:</td>\n",
       "      <td>fdc20642fbdf1295bb92b21837336e90576db0996d188f...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20</td>\n",
       "      <td>[INST] &lt;&lt;SYS&gt;&gt;\\nYou are tasked with writing co...</td>\n",
       "      <td>[PYTHON]\\ndef max_volume(s):\\n    return (s **...</td>\n",
       "      <td>43</td>\n",
       "      <td>834</td>\n",
       "      <td>True</td>\n",
       "      <td>['[/PYTHON]', '&lt;/s&gt;']</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.95</td>\n",
       "      <td>2.066289</td>\n",
       "      <td>...</td>\n",
       "      <td>33</td>\n",
       "      <td>318</td>\n",
       "      <td>{'instruction': 'You are tasked with writing c...</td>\n",
       "      <td>mbpp</td>\n",
       "      <td>codellama/CodeLlama-7b-Instruct-hf</td>\n",
       "      <td>def max_volume(s):\\n    return (s ** 3) / 3</td>\n",
       "      <td>assert max_volume(8) == 18\\nassert max_volume(...</td>\n",
       "      <td>False</td>\n",
       "      <td>failed:</td>\n",
       "      <td>e75b672fa3846b616e2f7123616d13f726edd37433f252...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1888</td>\n",
       "      <td>[INST] &lt;&lt;SYS&gt;&gt;\\nYour goal is to write code tha...</td>\n",
       "      <td>[PYTHON]\\ndef max_volume(sum_of_sides):\\n    #...</td>\n",
       "      <td>44</td>\n",
       "      <td>512</td>\n",
       "      <td>True</td>\n",
       "      <td>['[/PYTHON]', '&lt;/s&gt;']</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.95</td>\n",
       "      <td>10.906498</td>\n",
       "      <td>...</td>\n",
       "      <td>187</td>\n",
       "      <td>318</td>\n",
       "      <td>{'instruction': 'Your goal is to write code th...</td>\n",
       "      <td>mbpp</td>\n",
       "      <td>codellama/CodeLlama-7b-Instruct-hf</td>\n",
       "      <td>def max_volume(sum_of_sides):\\n    # Find the ...</td>\n",
       "      <td>assert max_volume(8) == 18\\nassert max_volume(...</td>\n",
       "      <td>False</td>\n",
       "      <td>failed:</td>\n",
       "      <td>8b389fe9be6e07349871ef5fec081e26554631f554fa39...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 26 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     id                                               text  \\\n",
       "0     0  [INST] <<SYS>>\\nYour goal is to write code tha...   \n",
       "1    20  [INST] <<SYS>>\\nYour goal is to write code tha...   \n",
       "2     0  [INST] <<SYS>>\\nYou are tasked with writing co...   \n",
       "3    20  [INST] <<SYS>>\\nYou are tasked with writing co...   \n",
       "4  1888  [INST] <<SYS>>\\nYour goal is to write code tha...   \n",
       "\n",
       "                                      generated_text  seed  max_new_tokens  \\\n",
       "0  [PYTHON]\\ndef max_volume(side_sum):\\n    retur...    42             834   \n",
       "1  [PYTHON]\\ndef max_volume(sum_of_sides):\\n    #...    43             834   \n",
       "2  [PYTHON]\\ndef max_volume(side_sum):\\n    retur...    42             834   \n",
       "3  [PYTHON]\\ndef max_volume(s):\\n    return (s **...    43             834   \n",
       "4  [PYTHON]\\ndef max_volume(sum_of_sides):\\n    #...    44             512   \n",
       "\n",
       "   do_sample         stop_sequences  temperature  top_p  response_time  ...  \\\n",
       "0       True  ['[/PYTHON]', '</s>']          0.8   0.95       1.852320  ...   \n",
       "1       True  ['[/PYTHON]', '</s>']          0.8   0.95       5.123448  ...   \n",
       "2       True  ['[/PYTHON]', '</s>']          0.8   0.95       1.870779  ...   \n",
       "3       True  ['[/PYTHON]', '</s>']          0.8   0.95       2.066289  ...   \n",
       "4       True  ['[/PYTHON]', '</s>']          0.8   0.95      10.906498  ...   \n",
       "\n",
       "   generated_tokens  task_id  \\\n",
       "0                38      318   \n",
       "1               119      318   \n",
       "2                38      318   \n",
       "3                33      318   \n",
       "4               187      318   \n",
       "\n",
       "                                          hypothesis  dataset  \\\n",
       "0  {'instruction': 'Your goal is to write code th...     mbpp   \n",
       "1  {'instruction': 'Your goal is to write code th...     mbpp   \n",
       "2  {'instruction': 'You are tasked with writing c...     mbpp   \n",
       "3  {'instruction': 'You are tasked with writing c...     mbpp   \n",
       "4  {'instruction': 'Your goal is to write code th...     mbpp   \n",
       "\n",
       "                   model_name_or_path  \\\n",
       "0  codellama/CodeLlama-7b-Instruct-hf   \n",
       "1  codellama/CodeLlama-7b-Instruct-hf   \n",
       "2  codellama/CodeLlama-7b-Instruct-hf   \n",
       "3  codellama/CodeLlama-7b-Instruct-hf   \n",
       "4  codellama/CodeLlama-7b-Instruct-hf   \n",
       "\n",
       "                                      generated_code  \\\n",
       "0  def max_volume(side_sum):\\n    return (side_su...   \n",
       "1  def max_volume(sum_of_sides):\\n    # Find the ...   \n",
       "2  def max_volume(side_sum):\\n    return (side_su...   \n",
       "3        def max_volume(s):\\n    return (s ** 3) / 3   \n",
       "4  def max_volume(sum_of_sides):\\n    # Find the ...   \n",
       "\n",
       "                                           test_code  passed    result  \\\n",
       "0  assert max_volume(8) == 18\\nassert max_volume(...   False  failed:    \n",
       "1  assert max_volume(8) == 18\\nassert max_volume(...   False  failed:    \n",
       "2  assert max_volume(8) == 18\\nassert max_volume(...   False  failed:    \n",
       "3  assert max_volume(8) == 18\\nassert max_volume(...   False  failed:    \n",
       "4  assert max_volume(8) == 18\\nassert max_volume(...   False  failed:    \n",
       "\n",
       "                                                hash  \n",
       "0  82ffe7d05e3196418e39d9aa46bf4126eb2759a419ca25...  \n",
       "1  c57d9f5efe770f1ce1a429101f3bd4aee5c529e81a3a84...  \n",
       "2  fdc20642fbdf1295bb92b21837336e90576db0996d188f...  \n",
       "3  e75b672fa3846b616e2f7123616d13f726edd37433f252...  \n",
       "4  8b389fe9be6e07349871ef5fec081e26554631f554fa39...  \n",
       "\n",
       "[5 rows x 26 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the predictions table.\n",
    "mbpp_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INST] <<SYS>>\n",
      "Your goal is to write code that performs the specified task.\n",
      "<</SYS>>\n",
      "\n",
      "Write a python function to find the maximum volume of a cuboid with given sum of sides.\n",
      "Your code should pass these tests:\n",
      "\n",
      "assert max_volume(8) == 18\n",
      "assert max_volume(4) == 2\n",
      "assert max_volume(1) == 0\n",
      "Your code should start with a [PYTHON] tag and end with a [/PYTHON] tag. [/INST]\n"
     ]
    }
   ],
   "source": [
    "# Print the complete prompt of the row at position 4 (the table preview truncates long strings).\n",
    "sample_prompt = mbpp_df['text'].iloc[4]\n",
    "print(sample_prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[PYTHON]\n",
      "def max_volume(sum_of_sides):\n",
      "    # Find the maximum volume of a cuboid with the given sum of sides\n",
      "    # by using the formula for the volume of a cuboid:\n",
      "    # volume = length * width * height\n",
      "    # We know that the sum of the sides is equal to the length, width, and height,\n",
      "    # so we can set up the following system of equations:\n",
      "    # length + width + height = sum_of_sides\n",
      "    # length * width * height = max_volume\n",
      "    # We can solve this system of equations to find the maximum volume:\n",
      "    length = sum_of_sides // 3\n",
      "    width = length\n",
      "    height = length\n",
      "    max_volume = length * width * height\n",
      "    return max_volume\n",
      "[/PYTHON]\n"
     ]
    }
   ],
   "source": [
    "# Print the raw model output of the row at position 4 (the table preview truncates long strings).\n",
    "sample_generation = mbpp_df['generated_text'].iloc[4]\n",
    "print(sample_generation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct prompt configurations; each entry is the string form of a dict\n",
    "# with at least an 'instruction' key.\n",
    "prompt_list = mbpp_df[\"hypothesis\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'instruction': 'You are required to write code that generates the specified output.'}\n",
      "no. generations: 9640\n",
      "no. tasks: 964\n",
      "0.4170124481327801\n",
      "-----\n",
      "{'instruction': 'You are required to write code that generates the specified output.', 'k_shot_idxs': [2]}\n",
      "no. generations: 9640\n",
      "no. tasks: 964\n",
      "0.4066390041493776\n",
      "-----\n",
      "{'instruction': 'You are required to write code that generates the specified output.', 'k_shot_idxs': [2, 7, 10]}\n",
      "no. generations: 9640\n",
      "no. tasks: 964\n",
      "0.3900414937759336\n",
      "-----\n"
     ]
    }
   ],
   "source": [
    "# Only prompt configurations built on this instruction are analysed.\n",
    "TARGET_INSTRUCTION = 'You are required to write code that generates the specified output.'\n",
    "\n",
    "# One entry per selected prompt: [hypothesis string, per-task loss array, queries, responses].\n",
    "res = []\n",
    "for p in prompt_list:\n",
    "\n",
    "    # Each hypothesis is the string form of a dict. literal_eval parses it as a plain\n",
    "    # Python literal, so nothing read from the CSV is ever executed (unlike eval).\n",
    "    if ast.literal_eval(p)[\"instruction\"] != TARGET_INSTRUCTION:\n",
    "        continue\n",
    "\n",
    "    print(p)\n",
    "\n",
    "    prompt_df = mbpp_df[mbpp_df[\"hypothesis\"] == p]\n",
    "    print(\"no. generations:\", len(prompt_df))\n",
    "\n",
    "    prompt_tasks = prompt_df[\"task_id\"].unique()\n",
    "    print(\"no. tasks:\", len(prompt_tasks))\n",
    "\n",
    "    # A task counts as solved when at least one of its generations passed.\n",
    "    # Rows with a missing `passed` value are ignored, so a task with no usable rows\n",
    "    # contributes no score. sort=False keeps tasks in order of first appearance.\n",
    "    task_solved = (\n",
    "        prompt_df.dropna(subset=[\"passed\"])\n",
    "        .groupby(\"task_id\", sort=False)[\"passed\"]\n",
    "        .any()\n",
    "    )\n",
    "\n",
    "    # Turn success (1/0) into a loss (0/1); the printed value is the mean loss over tasks.\n",
    "    scores = 1 - task_solved.astype(int).to_numpy()\n",
    "    print(np.mean(scores))\n",
    "\n",
    "    print(\"-----\")\n",
    "\n",
    "    # NOTE(review): `scores` has one entry per task, while `queries` and `responses`\n",
    "    # have one entry per generation -- confirm downstream consumers expect that.\n",
    "    queries = prompt_df[\"text\"].tolist()\n",
    "    responses = prompt_df[\"generated_text\"].tolist()\n",
    "\n",
    "    res.append([p, scores, queries, responses])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"../output/mbpp/mbpp_codellama_pass-10_loss_dist.pkl\"\n",
    "\n",
    "# Create the target folder if needed so open() does not fail on a fresh checkout.\n",
    "os.makedirs(os.path.dirname(filename), exist_ok=True)\n",
    "\n",
    "# Persist the per-prompt results collected in `res` with the newest pickle protocol.\n",
    "with open(filename, 'wb') as handle:\n",
    "    pkl.dump(res, handle, protocol=pkl.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
