{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c4c3dffb-98fa-4d14-98ef-80c38d290e1b",
   "metadata": {},
   "source": [
    "# GPT-3.5-Turbo on GSM8K"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2f7c15ca-974a-49aa-8212-d9b6730df39a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import openai\n",
    "import re\n",
    "import time\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from tqdm import tqdm\n",
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b63ddea4-8a1a-4629-9aa6-efdd47bf5e4a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "openai.api_key = \"sk-\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6032026c-70fc-4ba8-b353-fd3ecf78eb00",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading builder script: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4.47k/4.47k [00:00<00:00, 345kB/s]\n",
      "Downloading metadata: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.67k/3.67k [00:00<00:00, 2.56MB/s]\n",
      "Downloading readme: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.79k/6.79k [00:00<00:00, 3.36MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset gsm8k/main to /Users/bytedance/.cache/huggingface/datasets/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading data files:   0%|                                                                                                                      | 0/2 [00:00<?, ?it/s]\n",
      "Downloading data:   0%|                                                                                                                      | 0.00/1.34M [00:00<?, ?B/s]\u001b[A\n",
      "Downloading data:   3%|███▏                                                                                                          | 38.7k/1.34M [00:00<00:05, 247kB/s]\u001b[A\n",
      "Downloading data:   7%|████████                                                                                                      | 99.0k/1.34M [00:00<00:04, 291kB/s]\u001b[A\n",
      "Downloading data:  19%|████████████████████▋                                                                                          | 250k/1.34M [00:00<00:02, 532kB/s]\u001b[A\n",
      "Downloading data:  23%|█████████████████████████                                                                                      | 303k/1.34M [00:00<00:01, 530kB/s]\u001b[A\n",
      "Downloading data:  37%|█████████████████████████████████████████▌                                                                     | 503k/1.34M [00:01<00:01, 527kB/s]\u001b[A\n",
      "Downloading data:  76%|██████████████████████████████████████████████████████████████████████████████████▍                          | 1.02M/1.34M [00:01<00:00, 1.14MB/s]\u001b[A\n",
      "Downloading data:  85%|████████████████████████████████████████████████████████████████████████████████████████████▍                | 1.14M/1.34M [00:01<00:00, 1.01MB/s]\u001b[A\n",
      "Downloading data: 1.40MB [00:01, 991kB/s]                                                                                                                                \u001b[A\n",
      "Downloading data: 1.60MB [00:02, 784kB/s]\u001b[A\n",
      "Downloading data: 1.96MB [00:02, 1.04MB/s]\u001b[A\n",
      "Downloading data: 2.16MB [00:02, 1.04MB/s]\u001b[A\n",
      "Downloading data: 2.42MB [00:02, 1.00MB/s]\u001b[A\n",
      "Downloading data: 2.62MB [00:03, 806kB/s] \u001b[A\n",
      "Downloading data: 2.93MB [00:03, 982kB/s]\u001b[A\n",
      "Downloading data: 3.12MB [00:03, 981kB/s]\u001b[A\n",
      "Downloading data: 3.27MB [00:03, 945kB/s]\u001b[A\n",
      "Downloading data: 3.42MB [00:03, 1.04MB/s]\u001b[A\n",
      "Downloading data: 3.54MB [00:03, 884kB/s] \u001b[A\n",
      "Downloading data: 3.68MB [00:04, 829kB/s]\u001b[A\n",
      "Downloading data: 3.86MB [00:04, 1.00MB/s]\u001b[A\n",
      "Downloading data: 4.17MB [00:04, 908kB/s] \u001b[A\n",
      "Downloading data files:  50%|███████████████████████████████████████████████████████                                                       | 1/2 [00:06<00:06,  6.38s/it]\n",
      "Downloading data:   0%|                                                                                                                       | 0.00/242k [00:00<?, ?B/s]\u001b[A\n",
      "Downloading data:  20%|█████████████████████▍                                                                                        | 47.3k/242k [00:00<00:02, 81.7kB/s]\u001b[A\n",
      "Downloading data: 251kB [00:00, 342kB/s]                                                                                                                                 \u001b[A\n",
      "Downloading data: 352kB [00:01, 390kB/s]\u001b[A\n",
      "Downloading data: 505kB [00:01, 507kB/s]\u001b[A\n",
      "Downloading data: 750kB [00:01, 516kB/s]\u001b[A\n",
      "Downloading data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.66s/it]\n",
      "Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 667.56it/s]\n",
      "                                                                                                                                                                         \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset gsm8k downloaded and prepared to /Users/bytedance/.cache/huggingface/datasets/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 136.94it/s]\n"
     ]
    }
   ],
   "source": [
    "gsm8k = load_dataset('gsm8k', 'main')\n",
    "validation_index = np.load('../gsm8k/lib_prompt/validation_index.npy')\n",
    "validation_data = gsm8k['train'].select(validation_index)\n",
    "gsm8k_test = gsm8k['test']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "99da2222-207e-4d78-91a6-527af94c9afd",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gsm8k['train'][0]['question']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0aebf3f1-ff3a-458d-bdd4-42357aafaf20",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "gsm8k_test = gsm8k['test']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "ca22a6a6-73b4-49dd-bf78-1e7b8d93c58f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "prompt_complex = open('../gsm8k/lib_prompt/prompt_hardest.txt').read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "326b98c1-5b9b-41c2-a15b-f676a6814a7a",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\n",
      "Let's think step by step\n",
      "Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\n",
      "For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\n",
      "Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\n",
      "However, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\n",
      "They also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\n",
      "And they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\n",
      "So Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\n",
      "They want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\n",
      "They will need to plan to study 4 days to allow for all the time they need.\n",
      "The answer is 4\n",
      "\n",
      "Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws.  Their opponents score double the 2 pointers but half the 3 pointers and free throws.  What's the total number of points scored by both teams added together?\n",
      "Let's think step by step\n",
      "Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\n",
      "His team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\n",
      "They scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\n",
      "All together his team scored 50+24+10= 84 points\n",
      "Mark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\n",
      "His opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\n",
      "They also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\n",
      "All together Mark's opponents scored 100+12+5=117 points\n",
      "The total score for the game is both team's scores added together, so it is 84+117=201 points\n",
      "The answer is 201\n",
      "\n",
      "Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\n",
      "Let's think step by step\n",
      "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\n",
      "The total number of marbles she'll have is 60+24 = 84\n",
      "If Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\n",
      "If Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\n",
      "The total number of frisbees she'll have will increase to 30+12 = 42\n",
      "Bella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\n",
      "If she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\n",
      "The total number of deck cards she'll have is 10+4 = 14\n",
      "Together, Bella will have a total of 14+42+84 = 140 items\n",
      "The answer is 140\n",
      "\n",
      "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\n",
      "Let's think step by step\n",
      "For the first three baskets, the number of apples and oranges in one basket is 9+15=24\n",
      "In total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\n",
      "Since there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\n",
      "The number of apples in the fourth basket is 9-2=7\n",
      "There are also 15-2=13 oranges in the fourth basket\n",
      "The combined number of oranges and apples in the fourth basket is 13+7=20\n",
      "The fourth basket also contains 14-2=12 bananas.\n",
      "In total, the fourth basket has 20+12=32 fruits.\n",
      "The four baskets together have 32+114=146 fruits.\n",
      "The answer is 146\n",
      "\n",
      "Question: You can buy 4 apples or 1 watermelon for the same price. You bought 36 fruits evenly split between oranges, apples and watermelons, and the price of 1 orange is $0.50. How much does 1 apple cost if your total bill was $66?\n",
      "Let's think step by step\n",
      "If 36 fruits were evenly split between 3 types of fruits, then I bought 36/3 = 12 units of each fruit\n",
      "If 1 orange costs $0.50 then 12 oranges will cost $0.50 * 12 = $6\n",
      "If my total bill was $66 and I spent $6 on oranges then I spent $66 - $6 = $60 on the other 2 fruit types.\n",
      "Assuming the price of watermelon is W, and knowing that you can buy 4 apples for the same price and that the price of one apple is A, then 1W=4A\n",
      "If we know we bought 12 watermelons and 12 apples for $60, then we know that $60 = 12W + 12A\n",
      "Knowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n",
      "$60 = 48A + 12A\n",
      "$60 = 60A\n",
      "Then we know the price of one apple (A) is $60/60= $1\n",
      "The answer is 1\n",
      "\n",
      "Question: Susy goes to a large school with 800 students, while Sarah goes to a smaller school with only 300 students.  At the start of the school year, Susy had 100 social media followers.  She gained 40 new followers in the first week of the school year, half that in the second week, and half of that in the third week.  Sarah only had 50 social media followers at the start of the year, but she gained 90 new followers the first week, a third of that in the second week, and a third of that in the third week.  After three weeks, how many social media followers did the girl with the most total followers have?\n",
      "Let's think step by step\n",
      "After one week, Susy has 100+40 = 140 followers.\n",
      "In the second week, Susy gains 40/2 = 20 new followers.\n",
      "In the third week, Susy gains 20/2 = 10 new followers.\n",
      "In total, Susy finishes the three weeks with 140+20+10 = 170 total followers.\n",
      "After one week, Sarah has 50+90 = 140 followers.\n",
      "After the second week, Sarah gains 90/3 = 30 followers.\n",
      "After the third week, Sarah gains 30/3 = 10 followers.\n",
      "So, Sarah finishes the three weeks with 140+30+10 = 180 total followers.\n",
      "Thus, Sarah is the girl with the most total followers with a total of 180.\n",
      "The answer is 180\n",
      "\n",
      "Question: Sam bought a dozen boxes, each with 30 highlighter pens inside, for $10 each box. He rearranged five of these boxes into packages of six highlighters each and sold them for $3 per package. He sold the rest of the highlighters separately at the rate of three pens for $2. How much profit did he make in total, in dollars?\n",
      "Let's think step by step\n",
      "Sam bought 12 boxes x $10 = $120 worth of highlighters.\n",
      "He bought 12 * 30 = 360 highlighters in total.\n",
      "Sam then took 5 boxes × 6 highlighters/box = 30 highlighters.\n",
      "He sold these boxes for 5 * $3 = $15\n",
      "After selling these 5 boxes there were 360 - 30 = 330 highlighters remaining.\n",
      "These form 330 / 3 = 110 groups of three pens.\n",
      "He sold each of these groups for $2 each, so made 110 * 2 = $220 from them.\n",
      "In total, then, he earned $220 + $15 = $235.\n",
      "Since his original cost was $120, he earned $235 - $120 = $115 in profit.\n",
      "The answer is 115\n",
      "\n",
      "Question: In a certain school, 2/3 of the male students like to play basketball, but only 1/5 of the female students like to play basketball. What percent of the population of the school do not like to play basketball if the ratio of the male to female students is 3:2 and there are 1000 students?\n",
      "Let's think step by step\n",
      "The students are divided into 3 + 2 = 5 parts where 3 parts are for males and 2 parts are for females.\n",
      "Each part represents 1000/5 = 200 students.\n",
      "So, there are 3 x 200 = 600 males.\n",
      "And there are 2 x 200 = 400 females.\n",
      "Hence, 600 x 2/3 = 400 males play basketball.\n",
      "And 400 x 1/5 = 80 females play basketball.\n",
      "A total of 400 + 80 = 480 students play basketball.\n",
      "Therefore, 1000 - 480 = 520 do not like to play basketball.\n",
      "The percentage of the school that do not like to play basketball is 520/1000 * 100 = 52\n",
      "The answer is 52\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(prompt_complex)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e1e5d04e-ce12-4407-a150-4692c20ff4b0",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from tenacity import (\n",
    "    retry,\n",
    "    stop_after_attempt,\n",
    "    wait_chain,\n",
    "    wait_fixed\n",
    ") \n",
    "\n",
    "@retry(wait=wait_chain(*[wait_fixed(3) for i in range(3)] +\n",
    "                       [wait_fixed(5) for i in range(2)] +\n",
    "                       [wait_fixed(10)]))\n",
    "def completion_with_backoff(**kwargs):\n",
    "    return openai.ChatCompletion.create(**kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1204557b-25f8-458c-a95b-3e2db046595c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def test_answer(pred_str, ans_str):\n",
    "    pattern = '\\d*\\.?\\d+'\n",
    "    pred = re.findall(pattern, pred_str)\n",
    "    if(len(pred) >= 1):\n",
    "        # print(pred_str)\n",
    "        pred = pred[-1]\n",
    "        gold = re.findall(pattern, ans_str)\n",
    "        # print(ans_str)\n",
    "        gold = gold[-1]\n",
    "        return pred == gold\n",
    "    else: return False\n",
    "\n",
    "def parse_pred_ans(filename):\n",
    "    with open(filename) as fd: lines = fd.readlines()\n",
    "    am, a = None, None\n",
    "    num_q, acc = 0, 0\n",
    "    current_mode = 'none'\n",
    "    questions = []\n",
    "    ans_pred = []\n",
    "    ans_gold = []\n",
    "    for l in lines:\n",
    "        if(l.startswith('Q: ')):\n",
    "            if(am is not None and a is not None):\n",
    "                questions.append(q)\n",
    "                ans_pred.append(am)\n",
    "                ans_gold.append(a)\n",
    "                if(test_answer(am, a)):\n",
    "                    acc += 1\n",
    "            current_mode = 'q'\n",
    "            q = l\n",
    "            num_q += 1\n",
    "        elif(l.startswith('A_model:')):\n",
    "            current_mode = 'am'\n",
    "            am = l\n",
    "        elif(l.startswith('A:')):\n",
    "            current_mode = 'a'\n",
    "            a = l\n",
    "        else:\n",
    "            if(current_mode == 'q'): q += l\n",
    "            elif(current_mode == 'am'): am += l\n",
    "            elif(current_mode == 'a'): a += l\n",
    "            else:\n",
    "                raise ValueError(current_mode)\n",
    "                \n",
    "    questions.append(q)\n",
    "    ans_pred.append(am)\n",
    "    ans_gold.append(a)\n",
    "    if(test_answer(am, a)):\n",
    "        acc += 1\n",
    "    print('num_q %d correct %d ratio %.4f' % (num_q, acc, float(acc / num_q)))\n",
    "    return questions, ans_pred, ans_gold\n",
    "\n",
    "def test_finished(ans_model):\n",
    "    if('answer is' in ans_model): return True\n",
    "    else: return False\n",
    "\n",
    "def extract_ans(ans_model):\n",
    "    ans_model = ans_model.split('\\n')\n",
    "    ans = []\n",
    "    residual = []\n",
    "    for li, al in enumerate(ans_model):\n",
    "        ans.append(al)\n",
    "        if('answer is' in al):\n",
    "            break\n",
    "    residual = list(ans_model[li + 1:])\n",
    "    ans = '\\n'.join(ans)\n",
    "    residual = '\\n'.join(residual)\n",
    "    return ans, residual"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "44e7f8ce-f9e7-4c04-96fa-066c023ba4cc",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "prompt_q = prompt_complex + '\\nQuestion: ' + gsm8k_test[1]['question'] + '\\n'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "54e82439-47e4-4e72-ad44-e479c9db3b1b",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\n",
      "Let's think step by step\n",
      "Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\n",
      "For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\n",
      "Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\n",
      "However, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\n",
      "They also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\n",
      "And they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\n",
      "So Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\n",
      "They want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\n",
      "They will need to plan to study 4 days to allow for all the time they need.\n",
      "The answer is 4\n",
      "\n",
      "Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws.  Their opponents score double the 2 pointers but half the 3 pointers and free throws.  What's the total number of points scored by both teams added together?\n",
      "Let's think step by step\n",
      "Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\n",
      "His team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\n",
      "They scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\n",
      "All together his team scored 50+24+10= 84 points\n",
      "Mark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\n",
      "His opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\n",
      "They also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\n",
      "All together Mark's opponents scored 100+12+5=117 points\n",
      "The total score for the game is both team's scores added together, so it is 84+117=201 points\n",
      "The answer is 201\n",
      "\n",
      "Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\n",
      "Let's think step by step\n",
      "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\n",
      "The total number of marbles she'll have is 60+24 = 84\n",
      "If Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\n",
      "If Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\n",
      "The total number of frisbees she'll have will increase to 30+12 = 42\n",
      "Bella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\n",
      "If she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\n",
      "The total number of deck cards she'll have is 10+4 = 14\n",
      "Together, Bella will have a total of 14+42+84 = 140 items\n",
      "The answer is 140\n",
      "\n",
      "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\n",
      "Let's think step by step\n",
      "For the first three baskets, the number of apples and oranges in one basket is 9+15=24\n",
      "In total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\n",
      "Since there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\n",
      "The number of apples in the fourth basket is 9-2=7\n",
      "There are also 15-2=13 oranges in the fourth basket\n",
      "The combined number of oranges and apples in the fourth basket is 13+7=20\n",
      "The fourth basket also contains 14-2=12 bananas.\n",
      "In total, the fourth basket has 20+12=32 fruits.\n",
      "The four baskets together have 32+114=146 fruits.\n",
      "The answer is 146\n",
      "\n",
      "Question: You can buy 4 apples or 1 watermelon for the same price. You bought 36 fruits evenly split between oranges, apples and watermelons, and the price of 1 orange is $0.50. How much does 1 apple cost if your total bill was $66?\n",
      "Let's think step by step\n",
      "If 36 fruits were evenly split between 3 types of fruits, then I bought 36/3 = 12 units of each fruit\n",
      "If 1 orange costs $0.50 then 12 oranges will cost $0.50 * 12 = $6\n",
      "If my total bill was $66 and I spent $6 on oranges then I spent $66 - $6 = $60 on the other 2 fruit types.\n",
      "Assuming the price of watermelon is W, and knowing that you can buy 4 apples for the same price and that the price of one apple is A, then 1W=4A\n",
      "If we know we bought 12 watermelons and 12 apples for $60, then we know that $60 = 12W + 12A\n",
      "Knowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n",
      "$60 = 48A + 12A\n",
      "$60 = 60A\n",
      "Then we know the price of one apple (A) is $60/60= $1\n",
      "The answer is 1\n",
      "\n",
      "Question: Susy goes to a large school with 800 students, while Sarah goes to a smaller school with only 300 students.  At the start of the school year, Susy had 100 social media followers.  She gained 40 new followers in the first week of the school year, half that in the second week, and half of that in the third week.  Sarah only had 50 social media followers at the start of the year, but she gained 90 new followers the first week, a third of that in the second week, and a third of that in the third week.  After three weeks, how many social media followers did the girl with the most total followers have?\n",
      "Let's think step by step\n",
      "After one week, Susy has 100+40 = 140 followers.\n",
      "In the second week, Susy gains 40/2 = 20 new followers.\n",
      "In the third week, Susy gains 20/2 = 10 new followers.\n",
      "In total, Susy finishes the three weeks with 140+20+10 = 170 total followers.\n",
      "After one week, Sarah has 50+90 = 140 followers.\n",
      "After the second week, Sarah gains 90/3 = 30 followers.\n",
      "After the third week, Sarah gains 30/3 = 10 followers.\n",
      "So, Sarah finishes the three weeks with 140+30+10 = 180 total followers.\n",
      "Thus, Sarah is the girl with the most total followers with a total of 180.\n",
      "The answer is 180\n",
      "\n",
      "Question: Sam bought a dozen boxes, each with 30 highlighter pens inside, for $10 each box. He rearranged five of these boxes into packages of six highlighters each and sold them for $3 per package. He sold the rest of the highlighters separately at the rate of three pens for $2. How much profit did he make in total, in dollars?\n",
      "Let's think step by step\n",
      "Sam bought 12 boxes x $10 = $120 worth of highlighters.\n",
      "He bought 12 * 30 = 360 highlighters in total.\n",
      "Sam then took 5 boxes × 6 highlighters/box = 30 highlighters.\n",
      "He sold these boxes for 5 * $3 = $15\n",
      "After selling these 5 boxes there were 360 - 30 = 330 highlighters remaining.\n",
      "These form 330 / 3 = 110 groups of three pens.\n",
      "He sold each of these groups for $2 each, so made 110 * 2 = $220 from them.\n",
      "In total, then, he earned $220 + $15 = $235.\n",
      "Since his original cost was $120, he earned $235 - $120 = $115 in profit.\n",
      "The answer is 115\n",
      "\n",
      "Question: In a certain school, 2/3 of the male students like to play basketball, but only 1/5 of the female students like to play basketball. What percent of the population of the school do not like to play basketball if the ratio of the male to female students is 3:2 and there are 1000 students?\n",
      "Let's think step by step\n",
      "The students are divided into 3 + 2 = 5 parts where 3 parts are for males and 2 parts are for females.\n",
      "Each part represents 1000/5 = 200 students.\n",
      "So, there are 3 x 200 = 600 males.\n",
      "And there are 2 x 200 = 400 females.\n",
      "Hence, 600 x 2/3 = 400 males play basketball.\n",
      "And 400 x 1/5 = 80 females play basketball.\n",
      "A total of 400 + 80 = 480 students play basketball.\n",
      "Therefore, 1000 - 480 = 520 do not like to play basketball.\n",
      "The percentage of the school that do not like to play basketball is 520/1000 * 100 = 52\n",
      "The answer is 52\n",
      "\n",
      "Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(prompt_q)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "d6597b15-0a08-4ff7-b8b1-ba090cd91e16",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "response = openai.ChatCompletion.create(\n",
    "  model=\"gpt-3.5-turbo\",\n",
    "  messages=[\n",
    "        {\"role\": \"system\", \"content\": \"Follow the given examples and answer the question.\"},\n",
    "        {\"role\": \"user\", \"content\": prompt_q},\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "46225b43-db46-4cd6-8ac6-98c41a2f0506",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Let's think step by step\\nSince a robe takes 2 bolts of blue fiber and half that much white fiber, the amount of white fiber needed is 1 bolt (half of 2 bolts). \\nThe total amount of fiber needed is 2 bolts of blue fiber + 1 bolt of white fiber = 3 bolts of fiber.\\nTherefore, it takes 3 bolts in total to make the robe. \\nThe answer is 3.\""
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response['choices'][0]['message']['content']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "14ca497f-efaa-4410-afaf-4f70e1e37605",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Let's think step by step\n",
      "Since a robe takes 2 bolts of blue fiber and half that much white fiber, the amount of white fiber needed is 1 bolt (half of 2 bolts). \n",
      "The total amount of fiber needed is 2 bolts of blue fiber + 1 bolt of white fiber = 3 bolts of fiber.\n",
      "Therefore, it takes 3 bolts in total to make the robe. \n",
      "The answer is 3.\n"
     ]
    }
   ],
   "source": [
    "print(response['choices'][0]['message']['content'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bfb3e426-9907-4d73-81d5-3ba0898380ef",
   "metadata": {},
   "source": [
    "# Complex Prompt Random Sampling, Acc 77.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "329eaf72-c549-453d-9a4c-7e1ab32400c2",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1319/1319 [2:21:18<00:00,  6.43s/it]\n"
     ]
    }
   ],
   "source": [
    "i = 0\n",
    "with open('outputs/test_gpt_3.5_turbo_complex.txt', 'w') as fd:\n",
    "    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']), \n",
    "                               total=len(gsm8k_test['question'])):\n",
    "        \n",
    "        prompt_q = prompt_complex + '\\nQuestion: ' + q + '\\n'  \n",
    "        \n",
    "        response = completion_with_backoff(\n",
    "              model=\"gpt-3.5-turbo\",\n",
    "              messages=[\n",
    "                    {\"role\": \"system\", \"content\": \"Follow the given examples and answer the question.\"},\n",
    "                    {\"role\": \"user\", \"content\": prompt_q},\n",
    "                ]\n",
    "            )\n",
    "        ans_model = response['choices'][0]['message']['content']\n",
    "        ans_, residual = extract_ans(ans_model)\n",
    "            \n",
    "        fd.write('Q: %s\\nA_model:\\n%s\\nA:\\n%s\\n\\n' % (q, ans_, a))\n",
    "        i += 1\n",
    "        # if(i == 2): break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "efef9d01-229f-4b68-8ff7-bb1d9fca80ba",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num_q 1319 correct 1017 ratio 0.7710\n"
     ]
    }
   ],
   "source": [
    "_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_complex.txt')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ae3f728-c9c0-483f-b89e-5a4080554d70",
   "metadata": {},
   "source": [
    "# Complex Prompt Greedy Decoding, Acc 78.85"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "265654fb-e87f-479e-b20d-91913179d309",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1319/1319 [2:04:18<00:00,  5.65s/it]\n"
     ]
    }
   ],
   "source": [
    "i = 0\n",
    "with open('outputs/test_gpt_3.5_turbo_complex_temp_0.txt', 'w') as fd:\n",
    "    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']), \n",
    "                               total=len(gsm8k_test['question'])):\n",
    "        \n",
    "        prompt_q = prompt_complex + '\\nQuestion: ' + q + '\\n'  \n",
    "        \n",
    "        response = completion_with_backoff(\n",
    "              model=\"gpt-3.5-turbo\",\n",
    "              messages=[\n",
    "                    {\"role\": \"system\", \"content\": \"Follow the given examples and answer the question.\"},\n",
    "                    {\"role\": \"user\", \"content\": prompt_q},\n",
    "                ],\n",
    "                temperature=0\n",
    "            )\n",
    "        ans_model = response['choices'][0]['message']['content']\n",
    "        ans_, residual = extract_ans(ans_model)\n",
    "            \n",
    "        fd.write('Q: %s\\nA_model:\\n%s\\nA:\\n%s\\n\\n' % (q, ans_, a))\n",
    "        i += 1\n",
    "        # if(i == 2): break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "aad45c72-2349-4be3-a76a-7f0f0f5d8e06",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num_q 1319 correct 1040 ratio 0.7885\n"
     ]
    }
   ],
   "source": [
    "_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_complex_temp_0.txt')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7d6083d-7f10-4e1e-b947-565fa72e0bd0",
   "metadata": {},
   "source": [
    "# Baseline Prompt Greedy Decoding, Acc 74.98"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "444db3b1-7352-4dc9-a717-dc808e3961ed",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "prompt_original = open('../gsm8k/lib_prompt/prompt_original.txt').read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d5ce8d3d-9b1f-41ca-bfbe-2197d4602526",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1319/1319 [2:36:17<00:00,  7.11s/it]\n"
     ]
    }
   ],
   "source": [
    "i = 0\n",
    "with open('outputs/test_gpt_3.5_turbo_original_temp_0.txt', 'w') as fd:\n",
    "    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']), \n",
    "                               total=len(gsm8k_test['question'])):\n",
    "        \n",
    "        prompt_q = prompt_original + '\\nQuestion: ' + q + '\\n'  \n",
    "        \n",
    "        response = completion_with_backoff(\n",
    "              model=\"gpt-3.5-turbo\",\n",
    "              messages=[\n",
    "                    {\"role\": \"system\", \"content\": \"Follow the given examples and answer the question.\"},\n",
    "                    {\"role\": \"user\", \"content\": prompt_q},\n",
    "                ],\n",
    "                temperature=0\n",
    "            )\n",
    "        ans_model = response['choices'][0]['message']['content']\n",
    "        ans_, residual = extract_ans(ans_model)\n",
    "            \n",
    "        fd.write('Q: %s\\nA_model:\\n%s\\nA:\\n%s\\n\\n' % (q, ans_, a))\n",
    "        i += 1\n",
    "        # if(i == 2): break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3d695da9-7324-4d72-9d52-ace669da7443",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num_q 1319 correct 989 ratio 0.7498\n"
     ]
    }
   ],
   "source": [
    "_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_original_temp_0.txt')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6951e74d-0b95-45bb-88e3-b34e3a4472d6",
   "metadata": {},
   "source": [
    "# Baseline Prompt, Dialog In-Context Learning, Acc 76.8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "65b6a7cd-8e7c-4988-96b5-0afb759d3bcf",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def make_dialog_prompt(prompt):\n",
    "    messages = []\n",
    "    messages.append({\"role\": \"system\", \"content\": \"Follow the given examples and answer the question.\"})\n",
    "    cases = prompt.split(\"\\n\\n\")\n",
    "    for c in cases[:-1]:\n",
    "        question = c.split(\"\\n\")[:2]\n",
    "        messages.append({\"role\": \"user\", \"content\": \"\\n\".join(question)})\n",
    "        answer = c.split(\"\\n\")[2:]\n",
    "        messages.append({\"role\": \"assistant\", \"content\": \"\\n\".join(answer)})\n",
    "    messages.append({\"role\": \"user\", \"content\": cases[-1] + \"Let's think step by step\"})\n",
    "    return messages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "a76b5c24-d6ea-4bdf-b9e8-956eb52eb559",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1319/1319 [2:57:13<00:00,  8.06s/it]\n"
     ]
    }
   ],
   "source": [
    "i = 0\n",
    "with open('outputs/test_gpt_3.5_turbo_original_dialog_icl.txt', 'w') as fd:\n",
    "    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']), \n",
    "                               total=len(gsm8k_test['question'])):\n",
    "        \n",
    "        prompt_q = prompt_original + '\\nQuestion: ' + q + '\\n'\n",
    "        dialog_prompt = make_dialog_prompt(prompt_q)\n",
    "        \n",
    "        response = completion_with_backoff(\n",
    "              model=\"gpt-3.5-turbo\",\n",
    "              messages=dialog_prompt,\n",
    "              temperature=0\n",
    "            )\n",
    "        ans_model = response['choices'][0]['message']['content']\n",
    "        ans_, residual = extract_ans(ans_model)\n",
    "            \n",
    "        fd.write('Q: %s\\nA_model:\\n%s\\nA:\\n%s\\n\\n' % (q, ans_, a))\n",
    "        i += 1\n",
    "        # if(i == 2): break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "ec899995-c9f3-458a-a290-8433b882f21d",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num_q 1319 correct 1013 ratio 0.7680\n"
     ]
    }
   ],
   "source": [
    "_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_original_dialog_icl.txt')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c64368f0-ec02-49ef-bfe2-ad2357d578c4",
   "metadata": {},
   "source": [
    "# Complex Prompt, Dialog In-Context Learning, Acc "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "6f1bf1b1-f4c6-47e1-b876-168eec3c47d2",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1319/1319 [2:43:01<00:00,  7.42s/it]\n"
     ]
    }
   ],
   "source": [
    "i = 0\n",
    "with open('outputs/test_gpt_3.5_turbo_complex_dialog_icl.txt', 'w') as fd:\n",
    "    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']), \n",
    "                               total=len(gsm8k_test['question'])):\n",
    "        \n",
    "        prompt_q = prompt_complex + '\\nQuestion: ' + q + '\\n'\n",
    "        dialog_prompt = make_dialog_prompt(prompt_q)\n",
    "        \n",
    "        response = completion_with_backoff(\n",
    "              model=\"gpt-3.5-turbo\",\n",
    "              messages=dialog_prompt,\n",
    "              temperature=0\n",
    "            )\n",
    "        ans_model = response['choices'][0]['message']['content']\n",
    "        ans_, residual = extract_ans(ans_model)\n",
    "            \n",
    "        fd.write('Q: %s\\nA_model:\\n%s\\nA:\\n%s\\n\\n' % (q, ans_, a))\n",
    "        i += 1\n",
    "        # if(i == 2): break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "58b7f396-a31d-4206-aefe-8d49b9130bdd",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num_q 1319 correct 988 ratio 0.7491\n"
     ]
    }
   ],
   "source": [
    "_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_complex_dialog_icl.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6be091b7-083b-4ba9-a83d-c3628494c782",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
