{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Now calculate scores on BABILong"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we reuse code from the original benchmark and pack it into a single notebook.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "TASK_LABELS = {'qa1': ['bathroom', 'bedroom', 'garden', 'hallway', 'kitchen', 'office'], \n",
    " 'qa2': ['bathroom', 'bedroom', 'garden', 'hallway', 'kitchen', 'office'], \n",
    " 'qa3': ['bathroom', 'bedroom', 'garden', 'hallway', 'kitchen', 'office'], \n",
    " 'qa4': ['bathroom', 'bedroom', 'garden', 'hallway', 'kitchen', 'office'], \n",
    " 'qa5': ['Bill', 'Fred', 'Jeff', 'Mary', 'apple', 'football', 'milk'], \n",
    " 'qa6': ['no', 'yes'], \n",
    " 'qa7': ['none', 'one', 'three', 'two'], \n",
    " 'qa8': ['apple', 'football', 'milk', 'nothing'], \n",
    " 'qa9': ['no', 'yes'], \n",
    " 'qa10': ['maybe', 'no', 'yes'],\n",
    " 'qa11': ['bathroom', 'bedroom', 'garden', 'hallway', 'kitchen', 'office'], \n",
    " 'qa12': ['bathroom', 'bedroom', 'garden', 'hallway', 'kitchen', 'office'], \n",
    " 'qa13': ['bathroom', 'bedroom', 'garden', 'hallway', 'kitchen', 'office'], \n",
    " 'qa14': ['bedroom', 'cinema', 'kitchen', 'office', 'park', 'school'], \n",
    " 'qa15': ['cat', 'mouse', 'sheep', 'wolf'], \n",
    " 'qa16': ['gray', 'green', 'white', 'yellow'], \n",
    " 'qa17': ['no', 'yes'], \n",
    " 'qa18': ['no', 'yes'], \n",
    " 'qa19': ['e,e', 'e,n', 'e,s', 'n,e', 'n,n', 'n,w', 's,e', 's,s', 's,w', 'w,n', 'w,s', 'w,w'], \n",
    " 'qa20': ['bedroom', 'bored', 'garden', 'hungry', 'kitchen', 'thirsty', 'tired']\n",
    "}\n",
    "\n",
    "\n",
    "def preprocess_output(output):\n",
    "    output = output.lower()\n",
    "    # take only the first sentence from output\n",
    "    output = output.split('.')[0]\n",
    "    # filter responses when model tries to generate examples\n",
    "    output = output.split('<context>')[0]\n",
    "    output = output.split('<example>')[0]\n",
    "    output = output.split('Question')[0]\n",
    "    return output\n",
    "\n",
    "\n",
    "def preprocess_output_cot(output):\n",
    "    output = output.lower()\n",
    "    # take only the first sentence from output\n",
    "    splitted_output = output.split('.')\n",
    "    if len(splitted_output) == 1:\n",
    "        output = splitted_output[-1]\n",
    "    else:\n",
    "        output = splitted_output[-2]\n",
    "    # filter responses when model tries to generate examples\n",
    "    output = output.split('<context>')[0]\n",
    "    output = output.split('<example>')[0]\n",
    "    output = output.split('Question')[0]\n",
    "    return output\n",
    "\n",
    "\n",
    "def compare_answers(target, output, question, task_labels, cot_answer=False):\n",
    "    if cot_answer:\n",
    "        output = preprocess_output_cot(output)\n",
    "    else:\n",
    "        output = preprocess_output(output)\n",
    "    target = str(target).lower()\n",
    "    task_labels = {str(label).lower() for label in task_labels}\n",
    "\n",
    "    # extract labels that were mentioned in the question\n",
    "    labels_in_question = {label for label in task_labels if label in str(question).lower()}\n",
    "    # extract labels that were mentioned in the model output\n",
    "    labels_in_output = {label for label in task_labels if label in output}\n",
    "    # filter labels in the output to exclude mentioned in the question\n",
    "    # mentions in questions are never targets\n",
    "    labels_in_output = labels_in_output - labels_in_question\n",
    "\n",
    "    # check if the target is the only prediction\n",
    "    if ',' in target and len(target) > 3: \n",
    "        # if target contains multiple subtargets in qa8\n",
    "        subtargets = target.split(',')\n",
    "        num_subtargets = len(subtargets)\n",
    "        if all([t in labels_in_output for t in subtargets]) and len(labels_in_output) == num_subtargets:\n",
    "            return True\n",
    "    else:\n",
    "        if target in labels_in_output and len(labels_in_output) == 1:\n",
    "            return True\n",
    "\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "SYSTEM_TEMPLATE = '{instruction}\\n\\n{examples}\\n\\n{post_prompt}'\n",
    "USER_TEMPLATE = '<context>\\n{context}\\n</context>\\n\\nQuestion: {question}'\n",
    "DEFAULT_TEMPLATE = f'{SYSTEM_TEMPLATE}\\n\\n{USER_TEMPLATE}'\n",
    "\n",
    "# System prompt texts keyed by an identifier.\n",
    "CUSTOM_SYSTEM_PROMPTS = {\n",
    "    # Reference for the LONGLORA_LLAMA2 prompt:\n",
    "    # https://github.com/dvlab-research/LongLoRA/blob/2345c6d030f61ac3a031906386a103a5b05e0e6f/inference.py#L18\n",
    "    'LONGLORA_LLAMA2':\n",
    "        'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. '\n",
    "        'Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. '\n",
    "        'Please ensure that your responses are socially unbiased and positive in nature.\\n\\n'\n",
    "        'If a question does not make any sense, or is not factually coherent, explain why instead of answering '\n",
    "        'something not correct. If you don\\'t know the answer to a question, please don\\'t share false information.'\n",
    "}\n",
    "\n",
    "\n",
    "def get_formatted_input(context, question, examples, instruction, post_prompt, template=DEFAULT_TEMPLATE):\n",
    "    \"\"\"Fill a prompt template with the task description and one sample.\n",
    "\n",
    "    Args:\n",
    "        context: text to use for qa; surrounding whitespace is stripped.\n",
    "        question: question to answer based on context.\n",
    "        examples: in-context examples.\n",
    "        instruction: general instruction.\n",
    "        post_prompt: any additional instructions after examples.\n",
    "        template: str.format template that receives the fields above as\n",
    "            keyword arguments.\n",
    "\n",
    "    Returns:\n",
    "        The formatted prompt with leading and trailing whitespace removed.\n",
    "    \"\"\"\n",
    "    fields = {\n",
    "        'instruction': instruction,\n",
    "        'examples': examples,\n",
    "        'post_prompt': post_prompt,\n",
    "        'context': context.strip(),\n",
    "        'question': question,\n",
    "    }\n",
    "    prompt = template.format(**fields)\n",
    "    return prompt.strip()\n",
    "\n",
    "\n",
    "DEFAULT_PROMPTS = {\n",
    "    'qa1': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about positions of different persons hidden in some random text '\n",
    "            'and a question. You need to answer the question based only on the information from the facts. '\n",
    "            'If a person was in different locations, use the latest location to answer the question.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Charlie went to the hallway. Judith come back to the kitchen. Charlie travelled to balcony. '\n",
    "            'Where is Charlie?\\n'\n",
    "            'Answer: The most recent location of Charlie is balcony.\\n'\n",
    "            '</example>\\n\\n'\n",
    "            '<example>\\n'\n",
    "            'Alan moved to the garage. Charlie went to the beach. Alan went to the shop. Rouse '\n",
    "            'travelled to balcony. Where is Alan?\\n'\n",
    "            'Answer: The most recent location of Alan is shop.\\n'\n",
    "            '</example>',\n",
    "        'post_prompt':\n",
    "            'Always return your answer in the following format: '\n",
    "            'The most recent location of ’person’ is ’location’. Do not write anything else after that.'\n",
    "    },\n",
    "    'qa2': {\n",
    "        'instruction':\n",
    "            'I give you context with the facts about locations and actions of different persons '\n",
    "            'hidden in some random text and a question.'\n",
    "            'You need to answer the question based only on the information from the facts.\\n'\n",
    "            'If a person got an item in the first location and travelled to the second location '\n",
    "            'the item is also in the second location. '\n",
    "            'If a person dropped an item in the first location and moved to the second location '\n",
    "            'the item remains in the first location.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Charlie went to the kitchen. Charlie got a bottle. Charlie moved to the balcony. '\n",
    "            'Where is the bottle?\\n'\n",
    "            'Answer: The bottle is in the balcony.\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Alan moved to the garage. Alan got a screw driver. Alan moved to the kitchen. Where '\n",
    "            'is the screw driver?\\n'\n",
    "            'Answer: The screw driver is in the kitchen.\\n'\n",
    "            '</example>',\n",
    "        'post_prompt':\n",
    "            'Always return your answer in the following format: The ’item’ is in ’location’. '\n",
    "            'Do not write anything else after that.'\n",
    "    },\n",
    "    'qa3': {\n",
    "        'instruction':\n",
    "            'I give you context with the facts about locations and actions of different persons '\n",
    "            'hidden in some random text and a question. '\n",
    "            'You need to answer the question based only on the information from the facts.\\n'\n",
    "            'If a person got an item in the first location and travelled to the second location '\n",
    "            'the item is also in the second location. '\n",
    "            'If a person dropped an item in the first location and moved to the second location '\n",
    "            'the item remains in the first location.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'John journeyed to the bedroom. Mary grabbed the apple. Mary went back to the bathroom. '\n",
    "            'Daniel journeyed to the bedroom. Daniel moved to the garden. Mary travelled to the kitchen. '\n",
    "            'Where was the apple before the kitchen?\\n'\n",
    "            'Answer: Before the kitchen the apple was in the bathroom.\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'John went back to the bedroom. John went back to the garden. John went back to the kitchen. '\n",
    "            'Sandra took the football. Sandra travelled to the garden. Sandra journeyed to the bedroom. '\n",
    "            'Where was the football before the bedroom?\\n'\n",
    "            'Answer: Before the bedroom the football was in the garden.\\n'\n",
    "            '</example>',\n",
    "        'post_prompt':\n",
    "            'Always return your answer in the following format: '\n",
    "            'Before the $location_1$ the $item$ was in the $location_2$. Do not write anything else after that.'\n",
    "    },\n",
    "    'qa4': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about different people, their location and actions, hidden in '\n",
    "            'some random text and a question. '\n",
    "            'You need to answer the question based only on the information from the facts.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'The hallway is south of the kitchen. The bedroom is north of the kitchen. '\n",
    "            'What is the kitchen south of?\\n'\n",
    "            'Answer: bedroom\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'The garden is west of the bedroom. The bedroom is west of the kitchen. What is west of the bedroom?\\n'\n",
    "            'Answer: garden\\n'\n",
    "            '</example>',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - location. Do not write anything else after that.'\n",
    "    },\n",
    "    'qa5': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about locations and their relations hidden in some random text '\n",
    "            'and a question. You need to answer the question based only on the information from the facts.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Mary picked up the apple there. Mary gave the apple to Fred. Mary moved to the bedroom. '\n",
    "            'Bill took the milk there. Who did Mary give the apple to?\\n'\n",
    "            'Answer: Fred\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Jeff took the football there. Jeff passed the football to Fred. Jeff got the milk there. '\n",
    "            'Bill travelled to the bedroom. Who gave the football?\\n'\n",
    "            'Answer: Jeff\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Fred picked up the apple there. Fred handed the apple to Bill. Bill journeyed to the bedroom. '\n",
    "            'Jeff went back to the garden. What did Fred give to Bill?\\n'\n",
    "            'Answer: apple\\n'\n",
    "            '</example>',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word. Do not write anything else after that. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa6': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about people and their locations hidden in some random text and a '\n",
    "            'question. You need to answer the question based only on the information from the facts. '\n",
    "            'If a person was in different locations, use the latest location the person was in to answer the question.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'John travelled to the hallway. John travelled to the garden. Is John in the garden?\\n'\n",
    "            'Answer: yes\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Mary went to the office. Daniel journeyed to the hallway. Mary went to the bedroom. '\n",
    "            'Sandra went to the garden. Is Mary in the office?\\n'\n",
    "            'Answer: no\\n'\n",
    "            '</example>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - $yes$ or $no$. Do not write anything else after that. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa7': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about people and objects they carry, hidden in some random text '\n",
    "            'and a question. You need to answer the question based only on the information from the facts.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Daniel went to the bedroom. Daniel got the apple there. How many objects is Daniel carrying?\\n'\n",
    "            'Answer: one\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Mary grabbed the apple there. Mary gave the apple to John. How many objects is Mary carrying?\\n'\n",
    "            'Answer: none\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Sandra travelled to the hallway. Sandra picked up the milk there. Sandra took the apple there. '\n",
    "            'Mary travelled to the garden. How many objects is Sandra carrying?\\n'\n",
    "            'Answer: two\\n'\n",
    "            '</example>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - $none$ or $number_of_objects$. '\n",
    "            'Do not write anything else after that. Do not explain your answer.',\n",
    "    },\n",
    "    'qa8': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about people and objects they carry, hidden in some random text '\n",
    "            'and a question. You need to answer the question based only on the information from the facts.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Sandra travelled to the garden. Mary grabbed the milk there. What is Mary carrying?\\n'\n",
    "            'Answer: milk\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Mary travelled to the kitchen. Sandra travelled to the office. John travelled to the office. '\n",
    "            'Sandra discarded the milk there. What is Sandra carrying?\\n'\n",
    "            'Answer: nothing\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Daniel grabbed the apple there. Mary went to the office. Daniel moved to the garden. '\n",
    "            'Daniel grabbed the milk there. Mary went to the kitchen. What is Daniel carrying?\\n'\n",
    "            \"Answer: apple,milk\\n\"\n",
    "            \"</example>\\n\",\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one or two words: $nothing$ or $object$ or $object_1$, $object_2$. '\n",
    "            'Do not write anything else. Do not explain your answer.'\n",
    "    },\n",
    "    'qa9': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about people and their locations hidden in some random text and '\n",
    "            'a question. You need to answer the question based only on the information from the facts. '\n",
    "            'If a person was in different locations, use the latest location the person was in to answer the question.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'John is not in the bathroom. Sandra is not in the bedroom. Is John in the bathroom?\\n'\n",
    "            'Answer: no\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Mary journeyed to the kitchen. John is in the bedroom. Sandra is not in the garden. '\n",
    "            'Is Mary in the kitchen?\\n'\n",
    "            'Answer: yes\\n'\n",
    "            '</example>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa10': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about people and their locations hidden in some random text and a '\n",
    "            'question. You need to answer the question based only on the information from the facts. '\n",
    "            'If a person was in different locations, use the latest location the person was in to answer the question.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Bill is in the kitchen. Julie is either in the school or the cinema. Is Bill in the bedroom?\\n'\n",
    "            'Answer: no\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Fred is in the bedroom. Mary is either in the school or the cinema. Is Mary in the school?\\n'\n",
    "            'Answer: maybe\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Fred is either in the kitchen or the park. Bill moved to the cinema. Is Bill in the cinema?\\n'\n",
    "            'Answer: yes\\n'\n",
    "            '</example>\\n'\n",
    "            '<context>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - $yes$ or $no$ or $maybe$. Do not write anything else. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa11': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about people and their locations hidden in some random text and a '\n",
    "            'question. You need to answer the question based only on the information from the facts. '\n",
    "            'If a person was in different locations, use the latest location the person was in to answer the question.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Daniel journeyed to the hallway. After that he journeyed to the garden. Where is Daniel?\\n'\n",
    "            'Answer: garden\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Mary moved to the office. Afterwards she journeyed to the kitchen. Daniel went to the hallway. '\n",
    "            'Then he journeyed to the garden. Where is Mary?\\n'\n",
    "            'Answer: kitchen\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Sandra moved to the kitchen. After that she went back to the hallway. Sandra moved to the bedroom. '\n",
    "            'Then she went to the hallway. Mary moved to the bedroom. Afterwards she travelled to the bathroom. '\n",
    "            'Where is Sandra?\\n'\n",
    "            'Answer: hallway\\n'\n",
    "            '</example>\\n'\n",
    "            '<context>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - location. Do not write anything else after that. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa12': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about people and their locations hidden in some random text and a '\n",
    "            'question. You need to answer the question based only on the information from the facts. '\n",
    "            'If a person was in different locations, use the latest location the person was in to answer the question.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Mary and Daniel travelled to the bathroom. John and Daniel travelled to the office. Where is Daniel?\\n'\n",
    "            'Answer: office\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Sandra and Mary went back to the office. Daniel and Sandra went to the bedroom. Sandra and Mary travelled to the hallway. '\n",
    "            'John and Mary went to the kitchen. Where is Mary?\\n'\n",
    "            'Answer: kitchen\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Daniel and Sandra went back to the hallway. Daniel and John moved to the office. Daniel and John moved to the garden. '\n",
    "            'Daniel and Mary went back to the bathroom. Daniel and John went back to the kitchen. Daniel and Sandra went to the bathroom. '\n",
    "            'Where is John?\\n'\n",
    "            'Answer: kitchen\\n'\n",
    "            '</example>\\n'\n",
    "            '<context>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - location. Do not write anything else after that. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa13': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about people and their locations hidden in some random text and a '\n",
    "            'question. You need to answer the question based only on the information from the facts. '\n",
    "            'If a person was in different locations, use the latest location the person was in to answer the question.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Mary and Daniel travelled to the bathroom. Then they journeyed to the hallway. Where is Daniel?\\n'\n",
    "            'Answer: hallway\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Daniel and Sandra travelled to the kitchen. After that they journeyed to the hallway. Mary and Daniel travelled to the bedroom. '\n",
    "            'After that they travelled to the hallway. Where is Sandra?\\n'\n",
    "            'Answer: hallway\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'John and Mary moved to the bathroom. Then they travelled to the office. John and Mary went to the kitchen. '\n",
    "            'Afterwards they went to the bedroom. John and Sandra moved to the bathroom. Following that they went back to the kitchen. '\n",
    "            'Where is Mary?\\n'\n",
    "            'Answer: bedroom\\n'\n",
    "            '</example>\\n'\n",
    "            '<context>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - location. Do not write anything else after that. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa14': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about people and their locations hidden in some random text and a '\n",
    "            'question. You need to answer the question based only on the information from the facts. '\n",
    "            'If a person was in different locations, use the latest location the person was in to answer the question.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Bill went back to the cinema yesterday. Julie went to the school this morning. Fred went to the park yesterday. '\n",
    "            'Yesterday Julie went to the office. Where was Julie before the school?\\n'\n",
    "            'Answer: office\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'This morning Fred went to the kitchen. Fred journeyed to the bedroom yesterday. Mary travelled to the bedroom this morning. '\n",
    "            'Yesterday Mary went to the cinema. Where was Mary before the bedroom?\\n'\n",
    "            'Answer: cinema\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Yesterday Julie went back to the park. Julie went to the bedroom this morning. Bill journeyed to the cinema yesterday. '\n",
    "            'This morning Bill went back to the park. This evening Julie went to the school. This afternoon Julie went back to the park. '\n",
    "            'Where was Julie before the bedroom?\\n'\n",
    "            'Answer: park\\n'\n",
    "            '</example>\\n'\n",
    "            '<context>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - location. Do not write anything else after that. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa15': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about animals, their names and relations. The facts and a question '\n",
    "            'are hidden in some random text. You need to answer the question based only on the information from the facts.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Mice are afraid of wolves. Gertrude is a mouse. Cats are afraid of sheep. '\n",
    "            'Winona is a mouse. Sheep are afraid of wolves. Emily is a mouse. Jessica is a wolf. '\n",
    "            'What is gertrude afraid of?\\n'\n",
    "            'Answer: wolf\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Mice are afraid of wolves. Gertrude is a mouse. Cats are afraid of sheep. '\n",
    "            'Winona is a mouse. Sheep are afraid of wolves. Emily is a mouse. Jessica is a wolf. '\n",
    "            'What is jessica afraid of?\\n'\n",
    "            'Answer: cat\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Mice are afraid of cats. Wolves are afraid of sheep. Emily is a wolf. '\n",
    "            'Cats are afraid of sheep. Gertrude is a wolf. Sheep are afraid of cats. Winona is a wolf. '\n",
    "            'What is emily afraid of?\\n'\n",
    "            'Answer: sheep\\n'\n",
    "            '</example>\\n'\n",
    "            '<context>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - an animal species. Do not write anything else after that. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa16': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about animals, their names and colors. The facts and a question '\n",
    "            'are hidden in some random text. You need to answer the question based only on the information from the facts.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Lily is a frog. Bernhard is a frog. Bernhard is green. Brian is a lion. Brian is white. '\n",
    "            'Julius is a swan. Julius is green. Lily is green. Greg is a swan. What color is Greg?\\n'\n",
    "            'Answer: green\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Julius is a lion. Lily is a rhino. Bernhard is a swan. Lily is white. Bernhard is green. '\n",
    "            'Greg is a rhino. Greg is gray. Julius is white. Brian is a lion. What color is Brian?\\n'\n",
    "            'Answer: white\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Brian is a rhino. Julius is a lion. Bernhard is a lion. Greg is a swan. Brian is gray. '\n",
    "            'Greg is white. Lily is a rhino. Bernhard is yellow. Lily is gray. What color is Julius?\\n'\n",
    "            'Answer: yellow\\n'\n",
    "            '</example>\\n'\n",
    "            '<context>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - a color. Do not write anything else after that. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa17': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about different figures, their location and colors, hidden in '\n",
    "            'some random text and a question. '\n",
    "            'You need to answer the question based only on the information from the facts.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'The triangle is above the pink rectangle. The blue square is to the left of the triangle. '\n",
    "            'Is the pink rectangle to the right of the blue square?\\n'\n",
    "            'Answer: yes\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'The red sphere is to the left of the yellow square. The red sphere is below the pink rectangle. '\n",
    "            'Is the pink rectangle to the left of the yellow square?\\n'\n",
    "            'Answer: yes\\n'\n",
    "            '</example>'\n",
    "            '<example>\\n'\n",
    "            'The red sphere is above the pink rectangle. The red sphere is to the right of the red square. '\n",
    "            'Is the pink rectangle above the red square?\\n'\n",
    "            'Answer: no\\n'\n",
    "            '</example>',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa18': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about different objects and their sizes, hidden in '\n",
    "            'some random text and a question. '\n",
    "            'You need to answer the question based only on the information from the facts.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'The box of chocolates fits inside the chest. The box is bigger than the chest. The box is bigger than the suitcase. '\n",
    "            'The suitcase fits inside the box. The container is bigger than the box of chocolates. Does the box fit in the box of chocolates?\\n'\n",
    "            'Answer: no\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'The suitcase is bigger than the container. The container fits inside the box. The chest is bigger than the chocolate.'\n",
    "            'The suitcase fits inside the box. The chest fits inside the box. Does the chocolate fit in the box?\\n'\n",
    "            'Answer: yes\\n'\n",
    "            '</example>'\n",
    "            '<example>\\n'\n",
    "            'The chocolate fits inside the box of chocolates. The suitcase fits inside the box. The chocolate fits inside the box. '\n",
    "            'The box is bigger than the box of chocolates. The suitcase is bigger than the box of chocolates. Is the chocolate bigger than the box?\\n'\n",
    "            'Answer: no\\n'\n",
    "            '</example>',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - $yes$ or $no$. Do not write anything else. '\n",
    "            'Do not explain your answer.'\n",
    "    },\n",
    "    'qa19': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about different places and their locations, hidden in '\n",
    "            'some random text and a question. '\n",
    "            'You need to answer the question based only on the information from the facts.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'The office is east of the hallway. The kitchen is north of the office. The garden is west of the bedroom. '\n",
    "            'The office is west of the garden. The bathroom is north of the garden. How do you go from the kitchen to the garden?\\n'\n",
    "            'Answer: s,e\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'The bedroom is west of the hallway. The office is east of the garden. The garden is north of the kitchen. '\n",
    "            'The kitchen is north of the bathroom. The hallway is west of the garden. How do you go from the kitchen to the hallway?\\n'\n",
    "            'Answer: n,w\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'The bedroom is south of the hallway. The bathroom is east of the office. The kitchen is west of the garden. '\n",
    "            'The garden is south of the office. The office is south of the bedroom. How do you go from the garden to the bedroom?\\n'\n",
    "            'Answer: n,n\\n'\n",
    "            '</example>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only two letters, separated by a comma - ordinal directions. You can choose the letters from '\n",
    "             '$n$, $s$, $e$ and $w$. Do not write anything else after that.'\n",
    "    },\n",
    "    'qa20': {\n",
    "        'instruction':\n",
    "            'I will give you context with the facts about people, their locations and condition hidden in some random text and a '\n",
    "            'question. You need to answer the question based only on the information from the facts. '\n",
    "            'If a person was in different locations, use the latest location the person was in to answer the question.',\n",
    "        'examples':\n",
    "            '<example>\\n'\n",
    "            'Sumit is tired. Where will sumit go?\\n'\n",
    "            'Answer: bedroom\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Yann is hungry. Yann journeyed to the kitchen. Why did yann go to the kitchen?\\n'\n",
    "            'Answer: hungry\\n'\n",
    "            '</example>\\n'\n",
    "            '<example>\\n'\n",
    "            'Antoine is thirsty. Yann is tired. Yann went back to the bedroom. Yann picked up the pajamas there.'\n",
    "            'Jason is thirsty. Antoine went back to the kitchen. Why did antoine go to the kitchen?\\n'\n",
    "            'Answer: thirsty\\n'\n",
    "            '</example>\\n'\n",
    "            '<context>\\n',\n",
    "        'post_prompt':\n",
    "            'Your answer should contain only one word - a person condition or a place. Do not write anything else after that. '\n",
    "            'Do not explain your answer.'\n",
    "    }\n",
    "    \n",
    "\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_ad344 tr {\n",
       "  line-height: 15px;\n",
       "}\n",
       "#T_ad344 td {\n",
       "  line-height: inherit;\n",
       "  padding: 5px;\n",
       "}\n",
       "#T_ad344 th {\n",
       "  line-height: inherit;\n",
       "  padding: 5px;\n",
       "}\n",
       "#T_ad344 .row7 {\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_ad344 .row15 {\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_ad344 .row-1 {\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_ad344 .row-1 {\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_ad344 .row-1 {\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_ad344_row0_col0, #T_ad344_row0_col1, #T_ad344_row1_col0, #T_ad344_row1_col1, #T_ad344_row2_col0, #T_ad344_row2_col1, #T_ad344_row3_col0, #T_ad344_row3_col1, #T_ad344_row4_col0, #T_ad344_row4_col1, #T_ad344_row5_col0, #T_ad344_row5_col1, #T_ad344_row6_col0, #T_ad344_row6_col1, #T_ad344_row8_col0, #T_ad344_row8_col1, #T_ad344_row9_col0, #T_ad344_row9_col1, #T_ad344_row10_col0, #T_ad344_row10_col1, #T_ad344_row11_col0, #T_ad344_row11_col1, #T_ad344_row12_col1 {\n",
       "  background-color: #00441b;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_ad344_row7_col0 {\n",
       "  background-color: #7dccb5;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_ad344_row7_col1 {\n",
       "  background-color: #84cfb9;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_ad344_row12_col0 {\n",
       "  background-color: #00491d;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_ad344_row13_col0, #T_ad344_row13_col1 {\n",
       "  background-color: #005020;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_ad344_row14_col0, #T_ad344_row14_col1 {\n",
       "  background-color: #00682a;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_ad344_row15_col0 {\n",
       "  background-color: #f5fbfc;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_ad344_row15_col1 {\n",
       "  background-color: #f7fcfd;\n",
       "  color: #000000;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_ad344\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_ad344_level0_col0\" class=\"col_heading level0 col0\" >Original-ARMT-Llama-3.2-1B-Instruct - Time, sec.</th>\n",
       "      <th id=\"T_ad344_level0_col1\" class=\"col_heading level0 col1\" >Optimized-ARMT-Llama-3.2-1B-Instruct - Time, sec.</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row0\" class=\"row_heading level0 row0\" >qa1_0k</th>\n",
       "      <td id=\"T_ad344_row0_col0\" class=\"data row0 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row0_col1\" class=\"data row0 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row1\" class=\"row_heading level0 row1\" >qa1_1k</th>\n",
       "      <td id=\"T_ad344_row1_col0\" class=\"data row1 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row1_col1\" class=\"data row1 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row2\" class=\"row_heading level0 row2\" >qa1_2k</th>\n",
       "      <td id=\"T_ad344_row2_col0\" class=\"data row2 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row2_col1\" class=\"data row2 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row3\" class=\"row_heading level0 row3\" >qa1_4k</th>\n",
       "      <td id=\"T_ad344_row3_col0\" class=\"data row3 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row3_col1\" class=\"data row3 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row4\" class=\"row_heading level0 row4\" >qa1_8k</th>\n",
       "      <td id=\"T_ad344_row4_col0\" class=\"data row4 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row4_col1\" class=\"data row4 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row5\" class=\"row_heading level0 row5\" >qa1_16k</th>\n",
       "      <td id=\"T_ad344_row5_col0\" class=\"data row5 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row5_col1\" class=\"data row5 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row6\" class=\"row_heading level0 row6\" >qa1_32k</th>\n",
       "      <td id=\"T_ad344_row6_col0\" class=\"data row6 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row6_col1\" class=\"data row6 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row7\" class=\"row_heading level0 row7\" >qa1_64k</th>\n",
       "      <td id=\"T_ad344_row7_col0\" class=\"data row7 col0\" >70.00</td>\n",
       "      <td id=\"T_ad344_row7_col1\" class=\"data row7 col1\" >69.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row8\" class=\"row_heading level0 row8\" >qa2_0k</th>\n",
       "      <td id=\"T_ad344_row8_col0\" class=\"data row8 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row8_col1\" class=\"data row8 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row9\" class=\"row_heading level0 row9\" >qa2_1k</th>\n",
       "      <td id=\"T_ad344_row9_col0\" class=\"data row9 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row9_col1\" class=\"data row9 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row10\" class=\"row_heading level0 row10\" >qa2_2k</th>\n",
       "      <td id=\"T_ad344_row10_col0\" class=\"data row10 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row10_col1\" class=\"data row10 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row11\" class=\"row_heading level0 row11\" >qa2_4k</th>\n",
       "      <td id=\"T_ad344_row11_col0\" class=\"data row11 col0\" >100.00</td>\n",
       "      <td id=\"T_ad344_row11_col1\" class=\"data row11 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row12\" class=\"row_heading level0 row12\" >qa2_8k</th>\n",
       "      <td id=\"T_ad344_row12_col0\" class=\"data row12 col0\" >99.00</td>\n",
       "      <td id=\"T_ad344_row12_col1\" class=\"data row12 col1\" >100.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row13\" class=\"row_heading level0 row13\" >qa2_16k</th>\n",
       "      <td id=\"T_ad344_row13_col0\" class=\"data row13 col0\" >98.00</td>\n",
       "      <td id=\"T_ad344_row13_col1\" class=\"data row13 col1\" >98.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row14\" class=\"row_heading level0 row14\" >qa2_32k</th>\n",
       "      <td id=\"T_ad344_row14_col0\" class=\"data row14 col0\" >94.00</td>\n",
       "      <td id=\"T_ad344_row14_col1\" class=\"data row14 col1\" >94.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_ad344_level0_row15\" class=\"row_heading level0 row15\" >qa2_64k</th>\n",
       "      <td id=\"T_ad344_row15_col0\" class=\"data row15 col0\" >47.00</td>\n",
       "      <td id=\"T_ad344_row15_col1\" class=\"data row15 col1\" >46.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7f58efae1de0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Accuracy table: share of correct answers (%) per task / context length for each model.\n",
    "# Needs `os`, `np`, `pd` from the imports cell, `TASK_LABELS` from the first code cell and\n",
    "# `compare_answers`, which is expected to be defined by an earlier cell.\n",
    "results_folder = './test_res'\n",
    "mod_names = [\n",
    "    \"unsloth/Llama-3.2-1B-Instruct/orig_v2_model_paper_armt-1b-it-v2\",\n",
    "    \"unsloth/Llama-3.2-1B-Instruct/fast_executor_from_orig_paper_time_fix_v2_mem_patch_armt-1b-it-v2\",\n",
    "]\n",
    "# Column headers, one per entry of mod_names. The values are accuracies, so the headers say so\n",
    "# (they previously read 'Time, sec.', copied from the timing table).\n",
    "disp_names = [\n",
    "    \"Original-ARMT-Llama-3.2-1B-Instruct - Accuracy, %\",\n",
    "    \"Optimized-ARMT-Llama-3.2-1B-Instruct - Accuracy, %\",\n",
    "]\n",
    "prompt_name = 'instruction_yes_examples_yes_post_prompt_yes_chat_template_yes'\n",
    "tasks = ['qa1', 'qa2']\n",
    "lengths = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k']\n",
    "\n",
    "all_dfs = []\n",
    "for mod_name, disp_name in zip(mod_names, disp_names):\n",
    "    # row label ('<task>_<length>') -> accuracy in percent, for this model only\n",
    "    model_scores = {}\n",
    "    for task in tasks:\n",
    "        for ctx_length in lengths:\n",
    "            fname = os.path.join(results_folder, mod_name, f'{task}_{ctx_length}_{prompt_name}.csv')\n",
    "            if not os.path.isfile(fname):\n",
    "                # a missing run simply leaves a gap (NaN) in the final table\n",
    "                print(f'No such file: {fname}')\n",
    "                continue\n",
    "            df = pd.read_csv(fname)\n",
    "            # Blank NaN outputs *before* casting to str: casting first would turn them into the\n",
    "            # literal text 'nan' and leave fillna with nothing to replace.\n",
    "            df['output'] = df['output'].fillna('').astype(str)\n",
    "            df['correct'] = df.apply(\n",
    "                lambda row: compare_answers(row['target'], row['output'], row['question'], TASK_LABELS[task]),\n",
    "                axis=1,\n",
    "            )\n",
    "            score = df['correct'].sum()\n",
    "            acc = 100 * score / len(df) if len(df) > 0 else 0\n",
    "            model_scores[f'{task}_{ctx_length}'] = float(np.round(acc, 2))\n",
    "    all_dfs.append(pd.DataFrame.from_dict({disp_name: model_scores}))\n",
    "full_df = pd.concat(all_dfs, axis=1)\n",
    "\n",
    "s = full_df.style\n",
    "s.set_table_styles([\n",
    "    {\"selector\": \"tr\", \"props\": \"line-height: 15px;\"},\n",
    "    {\"selector\": \"td,th\", \"props\": \"line-height: inherit; padding: 5px;\"}\n",
    "])\n",
    "# Thick rule under the last row present for each task. Labels are taken from the index itself:\n",
    "# a label that is not in the index resolves to position -1 and only emits a dead '.row-1' rule.\n",
    "last_rows = []\n",
    "for task in tasks:\n",
    "    task_rows = [label for label in full_df.index if label.startswith(f'{task}_')]\n",
    "    if task_rows:\n",
    "        last_rows.append(task_rows[-1])\n",
    "s.set_table_styles(\n",
    "    {label: [{'selector': '', 'props': 'border-bottom: 3px solid black;'}] for label in last_rows},\n",
    "    overwrite=False, axis=1,\n",
    ")\n",
    "cmap = \"BuGn\"\n",
    "# axis=None: one colour scale shared by the whole table, so the models are directly comparable\n",
    "fres = s.format(precision=2).background_gradient(axis=None, cmap=cmap)\n",
    "\n",
    "with pd.option_context('display.max_colwidth', 20):\n",
    "    display(fres)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_255f4 tr {\n",
       "  line-height: 15px;\n",
       "}\n",
       "#T_255f4 td {\n",
       "  line-height: inherit;\n",
       "  padding: 5px;\n",
       "}\n",
       "#T_255f4 th {\n",
       "  line-height: inherit;\n",
       "  padding: 5px;\n",
       "}\n",
       "#T_255f4 .row5 {\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_255f4 .row11 {\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_255f4 .row-1 {\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_255f4 .row-1 {\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "#T_255f4 .row-1 {\n",
       "  border-bottom: 3px solid black;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_255f4\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_255f4_level0_col0\" class=\"col_heading level0 col0\" >Original-ARMT-Llama-3.2-1B-Instruct - Time, sec.</th>\n",
       "      <th id=\"T_255f4_level0_col1\" class=\"col_heading level0 col1\" >Optimized-ARMT-Llama-3.2-1B-Instruct - Time, sec.</th>\n",
       "      <th id=\"T_255f4_level0_col2\" class=\"col_heading level0 col2\" >Speedup</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row0\" class=\"row_heading level0 row0\" >qa1_2k</th>\n",
       "      <td id=\"T_255f4_row0_col0\" class=\"data row0 col0\" >13.43</td>\n",
       "      <td id=\"T_255f4_row0_col1\" class=\"data row0 col1\" >15.06</td>\n",
       "      <td id=\"T_255f4_row0_col2\" class=\"data row0 col2\" >0.89</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row1\" class=\"row_heading level0 row1\" >qa1_4k</th>\n",
       "      <td id=\"T_255f4_row1_col0\" class=\"data row1 col0\" >22.45</td>\n",
       "      <td id=\"T_255f4_row1_col1\" class=\"data row1 col1\" >17.99</td>\n",
       "      <td id=\"T_255f4_row1_col2\" class=\"data row1 col2\" >1.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row2\" class=\"row_heading level0 row2\" >qa1_8k</th>\n",
       "      <td id=\"T_255f4_row2_col0\" class=\"data row2 col0\" >41.41</td>\n",
       "      <td id=\"T_255f4_row2_col1\" class=\"data row2 col1\" >22.49</td>\n",
       "      <td id=\"T_255f4_row2_col2\" class=\"data row2 col2\" >1.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row3\" class=\"row_heading level0 row3\" >qa1_16k</th>\n",
       "      <td id=\"T_255f4_row3_col0\" class=\"data row3 col0\" >79.16</td>\n",
       "      <td id=\"T_255f4_row3_col1\" class=\"data row3 col1\" >33.12</td>\n",
       "      <td id=\"T_255f4_row3_col2\" class=\"data row3 col2\" >2.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row4\" class=\"row_heading level0 row4\" >qa1_32k</th>\n",
       "      <td id=\"T_255f4_row4_col0\" class=\"data row4 col0\" >153.68</td>\n",
       "      <td id=\"T_255f4_row4_col1\" class=\"data row4 col1\" >54.20</td>\n",
       "      <td id=\"T_255f4_row4_col2\" class=\"data row4 col2\" >2.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row5\" class=\"row_heading level0 row5\" >qa1_64k</th>\n",
       "      <td id=\"T_255f4_row5_col0\" class=\"data row5 col0\" >302.15</td>\n",
       "      <td id=\"T_255f4_row5_col1\" class=\"data row5 col1\" >94.36</td>\n",
       "      <td id=\"T_255f4_row5_col2\" class=\"data row5 col2\" >3.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row6\" class=\"row_heading level0 row6\" >qa2_2k</th>\n",
       "      <td id=\"T_255f4_row6_col0\" class=\"data row6 col0\" >13.08</td>\n",
       "      <td id=\"T_255f4_row6_col1\" class=\"data row6 col1\" >14.93</td>\n",
       "      <td id=\"T_255f4_row6_col2\" class=\"data row6 col2\" >0.88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row7\" class=\"row_heading level0 row7\" >qa2_4k</th>\n",
       "      <td id=\"T_255f4_row7_col0\" class=\"data row7 col0\" >22.66</td>\n",
       "      <td id=\"T_255f4_row7_col1\" class=\"data row7 col1\" >18.21</td>\n",
       "      <td id=\"T_255f4_row7_col2\" class=\"data row7 col2\" >1.24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row8\" class=\"row_heading level0 row8\" >qa2_8k</th>\n",
       "      <td id=\"T_255f4_row8_col0\" class=\"data row8 col0\" >41.66</td>\n",
       "      <td id=\"T_255f4_row8_col1\" class=\"data row8 col1\" >22.70</td>\n",
       "      <td id=\"T_255f4_row8_col2\" class=\"data row8 col2\" >1.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row9\" class=\"row_heading level0 row9\" >qa2_16k</th>\n",
       "      <td id=\"T_255f4_row9_col0\" class=\"data row9 col0\" >79.80</td>\n",
       "      <td id=\"T_255f4_row9_col1\" class=\"data row9 col1\" >33.38</td>\n",
       "      <td id=\"T_255f4_row9_col2\" class=\"data row9 col2\" >2.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row10\" class=\"row_heading level0 row10\" >qa2_32k</th>\n",
       "      <td id=\"T_255f4_row10_col0\" class=\"data row10 col0\" >153.82</td>\n",
       "      <td id=\"T_255f4_row10_col1\" class=\"data row10 col1\" >53.46</td>\n",
       "      <td id=\"T_255f4_row10_col2\" class=\"data row10 col2\" >2.88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_255f4_level0_row11\" class=\"row_heading level0 row11\" >qa2_64k</th>\n",
       "      <td id=\"T_255f4_row11_col0\" class=\"data row11 col0\" >303.40</td>\n",
       "      <td id=\"T_255f4_row11_col1\" class=\"data row11 col1\" >94.69</td>\n",
       "      <td id=\"T_255f4_row11_col2\" class=\"data row11 col2\" >3.20</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7f58ee26cac0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Timing table: measured time (sec.) per task / context length for each model, plus the speedup\n",
    "# of the optimized model over the original one. Needs `os`, `np`, `pd` from the imports cell.\n",
    "import json\n",
    "\n",
    "results_folder = './test_res'\n",
    "mod_names = [\n",
    "    \"unsloth/Llama-3.2-1B-Instruct/orig_v2_model_paper_armt-1b-it-v2\",\n",
    "    \"unsloth/Llama-3.2-1B-Instruct/fast_executor_from_orig_paper_time_fix_v2_mem_patch_armt-1b-it-v2\",\n",
    "]\n",
    "# Column headers, one per entry of mod_names (original first, optimized second).\n",
    "disp_names = [\n",
    "    \"Original-ARMT-Llama-3.2-1B-Instruct - Time, sec.\",\n",
    "    \"Optimized-ARMT-Llama-3.2-1B-Instruct - Time, sec.\",\n",
    "]\n",
    "prompt_name = 'instruction_yes_examples_yes_post_prompt_yes_chat_template_yes'\n",
    "tasks = ['qa1', 'qa2']\n",
    "lengths = ['2k', '4k', '8k', '16k', '32k', '64k']\n",
    "\n",
    "all_dfs = []\n",
    "for mod_name, disp_name in zip(mod_names, disp_names):\n",
    "    # row label ('<task>_<length>') -> time in seconds, for this model only\n",
    "    model_times = {}\n",
    "    for task in tasks:\n",
    "        for ctx_length in lengths:\n",
    "            fname = os.path.join(results_folder, mod_name, f'time_{task}_{ctx_length}_{prompt_name}.json')\n",
    "            if not os.path.isfile(fname):\n",
    "                # a missing run simply leaves a gap (NaN) in the final table\n",
    "                print(f'No such file: {fname}')\n",
    "                continue\n",
    "            with open(fname, 'r') as f:\n",
    "                timings = json.load(f)\n",
    "            # the value is stored under timings[task][ctx_length]\n",
    "            model_times[f'{task}_{ctx_length}'] = np.round(float(timings[task][ctx_length]), 2)\n",
    "            # NOTE: a stale line that wrote `100 * score / len(df)` into an unused `accuracy` array\n",
    "            # was dropped here; `score` only existed if the accuracy cell had been run before.\n",
    "    all_dfs.append(pd.DataFrame.from_dict({disp_name: model_times}))\n",
    "full_df = pd.concat(all_dfs, axis=1)\n",
    "# Speedup > 1 means the optimized model took less time than the original one.\n",
    "full_df[\"Speedup\"] = full_df[disp_names[0]] / full_df[disp_names[1]]\n",
    "\n",
    "s = full_df.style\n",
    "s.set_table_styles([\n",
    "    {\"selector\": \"tr\", \"props\": \"line-height: 15px;\"},\n",
    "    {\"selector\": \"td,th\", \"props\": \"line-height: inherit; padding: 5px;\"}\n",
    "])\n",
    "# Thick rule under the last row present for each task. Labels are taken from the index itself:\n",
    "# a label that is not in the index resolves to position -1 and only emits a dead '.row-1' rule.\n",
    "last_rows = []\n",
    "for task in tasks:\n",
    "    task_rows = [label for label in full_df.index if label.startswith(f'{task}_')]\n",
    "    if task_rows:\n",
    "        last_rows.append(task_rows[-1])\n",
    "s.set_table_styles(\n",
    "    {label: [{'selector': '', 'props': 'border-bottom: 3px solid black;'}] for label in last_rows},\n",
    "    overwrite=False, axis=1,\n",
    ")\n",
    "fres = s.format(precision=2)\n",
    "\n",
    "with pd.option_context('display.max_colwidth', 20):\n",
    "    display(fres)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:conda_rmt_it_venv]",
   "language": "python",
   "name": "conda-env-conda_rmt_it_venv-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
