{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LLM evaluation demo\n",
    "\n",
    "First, install the dependencies, then download and unpack the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (unlike a bare !pip) installs into the environment of the running kernel.\n",
    "%pip install datasets torch openai tiktoken nltk seaborn matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the babilong sources; skipped when ./source already exists so the cell can be re-run.\n",
    "![ -d source ] || git clone https://github.com/booydar/babilong source\n",
    "# Unpack the task data; -n keeps files that are already extracted, -q silences the file listing.\n",
    "!unzip -n -q source/data/tasks_1-20_v1-2.zip -d data/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import openai \n",
    "import asyncio\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from datasets import load_dataset, Dataset\n",
    "import numpy as np\n",
    "import tiktoken\n",
    "import time\n",
    "import os\n",
    "import datasets\n",
    "from source.babilong_utils import TaskDataset, SentenceSampler, NoiseInjectionDataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Choose parameters for evaluation:\n",
    "* task: babilong task (qa1 - qa10). Use a different prompt for each task! You can find our prompts in Appendix B of our paper\n",
    "* task_path: path to the file with task data\n",
    "* message_lengths: list of task lengths to evaluate. Each length must be less than the LLM's context window\n",
    "* number_of_samples: number of task instances to evaluate for each message length\n",
    "\n",
    "# Choose LLM\n",
    "* model: ChatGPT model\n",
    "    * 'gpt-4-1106-preview' - GPT4-Turbo, context length up to 128k\n",
    "    * 'gpt-4' - GPT4, context length up to 32k\n",
    "    * 'gpt-3.5-turbo-1106' - GPT3.5, context length up to 16k\n",
    "    * 'ft:gpt-3.5-turbo-1106:personal:babilong-qa1:8nZsdRrH' - GPT3.5 fine-tuned for the qa1 task, context length up to 16k\n",
    "* token: your token for the OpenAI API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Task to evaluate and the file holding its samples.\n",
    "task = 'qa1'\n",
    "task_path = 'data/tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_test.txt'\n",
    "# Message lengths to evaluate; each one has to fit into the context window of `model`.\n",
    "message_lengths = [4000, 8000, 16000, 32000, 64000, 128000]\n",
    "# Number of task instances evaluated for every message length.\n",
    "number_of_samples = 25\n",
    "\n",
    "model = 'gpt-4-1106-preview'\n",
    "# Prefer the OPENAI_API_KEY environment variable so the real key is never saved in the notebook;\n",
    "# the placeholder is only used when that variable is not set.\n",
    "token = os.environ.get('OPENAI_API_KEY', 'YOUR_CHAT_GPT_TOKEN')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adapt the tiktoken API to the babilong utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Tokenizer:\n",
    "    \"\"\"Adapter that wraps a tiktoken encoding in the tokenizer interface used with the babilong utils.\"\"\"\n",
    "\n",
    "    def __init__(self, model):\n",
    "        \"\"\"Look up the tiktoken encoding for the model name `model`.\"\"\"\n",
    "        self.impl_ = tiktoken.encoding_for_model(model)\n",
    "\n",
    "    def __call__(self, inp):\n",
    "        \"\"\"Tokenize a single input, or every element of a list.\n",
    "\n",
    "        Returns a dict whose 'input_ids' entry holds the result of `encode` for a\n",
    "        non-list input and the result of `encode_batch` for a list input.\n",
    "        \"\"\"\n",
    "        if isinstance(inp, list):\n",
    "            result = self.impl_.encode_batch(inp)\n",
    "        else:\n",
    "            result = self.impl_.encode(inp)\n",
    "        return {'input_ids': result}\n",
    "\n",
    "    def encode(self, inp, add_special_tokens=False):\n",
    "        \"\"\"Return the token ids of `inp`.\n",
    "\n",
    "        `add_special_tokens` exists only so callers may pass it; its value is ignored,\n",
    "        which is why it now has a default.\n",
    "        \"\"\"\n",
    "        return self.impl_.encode(inp)\n",
    "\n",
    "    def decode(self, inp):\n",
    "        \"\"\"Turn one sequence of token ids back into text.\"\"\"\n",
    "        return self.impl_.decode(inp)\n",
    "\n",
    "    def decode_batch(self, inp):\n",
    "        \"\"\"Turn a batch of token-id sequences back into a list of texts.\"\"\"\n",
    "        return self.impl_.decode_batch(inp)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Choose noise dataset\n",
    "\n",
    "In this example, we use wikitext to generate the noise. \n",
    "To reproduce the results from our paper, use PG19."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text used as noise: the 'wikitext-2-raw-v1' configuration of the wikitext dataset.\n",
    "noise_dataset = datasets.load_dataset(path='wikitext', name='wikitext-2-raw-v1')\n",
    "# PG19 alternative: load a dataset stored on disk under 'pg19-data-test' instead.\n",
    "#noise_dataset = datasets.load_from_disk(\"pg19-data-test\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run evaluation loop and save results in .csv format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create directory to save LLM responses\n",
    "os.makedirs(task, exist_ok=True)\n",
    "\n",
    "client = openai.OpenAI(api_key=token)\n",
    "\n",
    "tokenizer = Tokenizer(model)\n",
    "\n",
    "task_dataset_test = TaskDataset(task_path)\n",
    "noise_sampler_test = SentenceSampler(noise_dataset['test'], tokenizer=tokenizer)\n",
    "\n",
    "# The prompt below asks for a person's latest location, so it has to be changed when `task` is changed.\n",
    "for message_length in message_lengths:\n",
    "    outfile = f'{task}/msg_{message_length}.csv'\n",
    "    # one dict per evaluated sample; the keys become the CSV columns\n",
    "    records = []\n",
    "\n",
    "    # create dataset with given message length\n",
    "    dataset_test = NoiseInjectionDataset(task_dataset=task_dataset_test,\n",
    "                                         noise_sampler=noise_sampler_test,\n",
    "                                         tokenizer=tokenizer,\n",
    "                                         sample_size=message_length)\n",
    "\n",
    "    # collect GPT responses and save them in .csv\n",
    "    # zip with range() stops after number_of_samples samples\n",
    "    for i, sample in zip(range(number_of_samples), dataset_test):\n",
    "        question = sample['question']\n",
    "        true_answer = tokenizer.decode(sample['target_tokens'])\n",
    "        # text that is placed between the <context> tags of the prompt\n",
    "        query = tokenizer.decode(sample['input_tokens'])\n",
    "\n",
    "        messages = [\n",
    "            {\n",
    "                \"role\": \"system\",\n",
    "                \"content\": \"You are a intelligent assistant.\"\n",
    "            },\n",
    "            {\n",
    "                \"role\": \"user\",\n",
    "                \"content\":\n",
    "                    \"I give you context with the facts about positions of different persons hidden in some random text and a question. \"\n",
    "                    \"You need to answer the question based only on the information from the facts. \"\n",
    "                    \"If a person was in different locations use the latest location to answer the question.\\n\\n\"\n",
    "                    \"<example>\\n\"\n",
    "                    \"Charlie went to the hallway. Judith come back to the kitchen. Charlie travelled to balcony. Where is Charlie?\\n\"\n",
    "                    \"Assistant: balcony\\n\"\n",
    "                    \"</example>\\n\\n\"\n",
    "                    \"<example>\\n\"\n",
    "                    \"Alan moved to the garage. Charlie went to the beach. Alan went to the shop. Rouse travelled to balcony. Where is Alan?\\n\"\n",
    "                    \"Assistant: shop\\n\"\n",
    "                    \"</example>\\n\\n\"\n",
    "                    \"<context>\\n\"\n",
    "                    f\"{query}\"\n",
    "                    \"</context>\\n\\n\"\n",
    "                    f\"QUESTION: {question}\\n\"\n",
    "                    \"Your answer should be a single word - the most recent location of ’person’ in the question. \"\n",
    "                    \"Do not write anything afer that.\"\n",
    "            },\n",
    "        ]\n",
    "\n",
    "        response = client.chat.completions.create(model=model, messages=messages)\n",
    "        # the message content may be None; treat that as an empty answer instead of crashing\n",
    "        gpt_answer = (response.choices[0].message.content or '').strip().lower()\n",
    "\n",
    "        # drop a single trailing period so that 'kitchen.' matches 'kitchen'\n",
    "        if gpt_answer.endswith('.'):\n",
    "            gpt_answer = gpt_answer[:-1]\n",
    "\n",
    "        print(message_length, i, true_answer, gpt_answer)\n",
    "\n",
    "        records.append({\n",
    "            'answer': true_answer,\n",
    "            'gpt4answer': gpt_answer,\n",
    "            'result': true_answer == gpt_answer,\n",
    "        })\n",
    "        # rewrite the file after every sample so partial results survive an interrupted run\n",
    "        pd.DataFrame(records, columns=['answer', 'gpt4answer', 'result']).to_csv(outfile)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Plot evaluation results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib\n",
    "import matplotlib.pylab as plt\n",
    "from matplotlib.colors import LinearSegmentedColormap\n",
    "\n",
    "# accuracy per message length; NaN marks lengths without results, which the heatmap leaves blank\n",
    "accuracy = np.full((1, len(message_lengths)), np.nan)\n",
    "\n",
    "for i, msg in enumerate(message_lengths):\n",
    "    fname = f'{task}/msg_{msg}.csv'\n",
    "    if not os.path.isfile(fname):\n",
    "        print('not such file', fname)\n",
    "        continue\n",
    "    # read the answers strictly as text: with the default NA handling an empty reply, or a reply\n",
    "    # such as 'null' or 'nan', is loaded as float NaN and breaks the string operations below\n",
    "    df = pd.read_csv(fname, index_col=[0], dtype={'answer': str, 'gpt4answer': str},\n",
    "                     keep_default_na=False)\n",
    "    if df.empty:\n",
    "        continue\n",
    "    # only the last word of the reply is compared with the expected answer\n",
    "    last_word = df['gpt4answer'].apply(lambda x: x.split(' ')[-1]).apply(lambda x: x.split('\\n')[-1])\n",
    "    score = (last_word == df['answer']).sum()\n",
    "    accuracy[0, i] = score / len(df)\n",
    "\n",
    "font = {\n",
    "    'size'   : 30\n",
    "}\n",
    "\n",
    "matplotlib.rc('font', **font)\n",
    "# explicit figure and axes instead of whatever figure happens to be current\n",
    "fig, ax = plt.subplots(figsize=(18.5, 2))\n",
    "\n",
    "cmap = LinearSegmentedColormap.from_list('ryg', [\"red\", \"yellow\", \"green\"], N=256)\n",
    "sns.heatmap(accuracy, cmap=cmap, vmin=0, vmax=1, annot=True, linewidth=0.5, ax=ax)\n",
    "ax.set_xlabel('Context size')\n",
    "labels = [f'{ln}' if ln < 1000 else f'{int(ln / 1000)}k' for ln in message_lengths]\n",
    "# heatmap cells are one unit wide, so +0.5 centres each tick on its cell\n",
    "ax.set_xticks(np.array(range(accuracy.shape[1])) + 0.5, labels)\n",
    "ax.tick_params(axis='y', rotation=90)\n",
    "ax.set_yticks([0], [''])\n",
    "ax.set_title(f'Performance of {model} on BABILong benchmark task {task}, accuracy')\n",
    "\n",
    "# save before show(): depending on the backend the figure may be released once it is shown\n",
    "fig.savefig('fig.pdf', dpi=100)\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "babilong",
   "language": "python",
   "name": "babilong"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
