{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "id": "iHdhpZnyvJ6x"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import openai\n",
    "import pandas as pd\n",
    "from collections import deque\n",
    "from tqdm import tqdm\n",
    "\n",
    "openai.api_key = \"INSERT API TOKEN\" #Use cautiously "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "kmuYu-Vc_zKB",
    "outputId": "94acc5fc-89fa-47bd-fbe1-77ff93b04163"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
     ]
    }
   ],
   "source": [
    "from google.colab import drive\n",
    "drive.mount('/content/drive')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "id": "K4WJZbaC_32S"
   },
   "outputs": [],
   "source": [
    "def promptGen(oneshot, premise):\n",
    "  p = oneshot + \"\\n\\nPremise: \"+ premise+\"\\n\\n\"\n",
    "  return p\n",
    "\n",
    "def gpt3(oneshot, premise,prmpt):\n",
    "  allData = \"\"\n",
    "  ppt = oneshot + \"\\n\" + \"\\n\\n\" + prmpt  + \"\\n\\nPremise: \\n\" + premise + \"\\n\\nHypothesis: \\n\"\n",
    "  \n",
    "  response = openai.Completion.create(\n",
    "  model=\"text-davinci-002\",\n",
    "  prompt=ppt,\n",
    "  temperature=0.7,\n",
    "  max_tokens=512,\n",
    "  top_p=1,\n",
    "  frequency_penalty=2,\n",
    "  presence_penalty=2\n",
    "  )\n",
    "  # print(\"+++++++++++++++++++++++++++++++\")\n",
    "  # print(\"ppt----------> \",ppt, \"\\n GPT3:\\n\", response['choices'][0]['text'])\n",
    "  allData = {\"Premise\":premise,\"GPT3 Response\":\"Hypothesis:\\n\"+ response['choices'][0]['text']}\n",
    "\n",
    "  return allData"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "id": "u_VshJ38__1Y"
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"/content/drive/MyDrive/Colab Notebooks/DataGen/rte_premise_list.csv\")\n",
    "df.head()\n",
    "dfcopy = df.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "zITI1BZ3njJ1",
    "outputId": "5e5a8b9f-741b-4463-e5fb-fdac35a2bc6f"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1204, 1)"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfcopy.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OHO4rGZUl5yM"
   },
   "source": [
    "## entailment "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "28WO_6IAXsU-",
    "outputId": "b37db76e-d1a9-4351-8d52-48f1c2ba8857"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|          | 4/400 [00:07<11:34,  1.75s/it]\n"
     ]
    }
   ],
   "source": [
    "data = []\n",
    "pathtosave = \"/content/drive/MyDrive/Colab Notebooks/DataGen/Commit\"\n",
    "shardno = 1\n",
    "num = 400 #Num of Passage or data to generate\n",
    "oneshot = '''Given a premise, the task is to generate a hypothesis and a label if it is \"entailment\". An example is given below to help you out. \n",
    "\n",
    "Premise: \n",
    "A place of sorrow after Pope John Paul II died became a place of celebration as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI.\n",
    "\n",
    "Hypothesis:\n",
    "Pope Benedict XVI is the new leader of the Roman Catholic Church.\n",
    "\n",
    "Label:\n",
    "entailment'''\n",
    "#change ppt\n",
    "ppt = '''Generate hypothesis and label now. Be diverse in example generation.  Note that the label should be \"entailment\". '''\n",
    "df = dfcopy[0:2500]\n",
    "for i in tqdm(range(df.shape[0])):\n",
    "  if data!=[] and len(data)%100==0:\n",
    "    print(\"Shard no: \", shardno)\n",
    "    dataDF = pd.DataFrame(data)\n",
    "    dataDF.to_csv(pathtosave + f\"/shard_oneshot_entailment_rte_{shardno}.csv\", index=False)\n",
    "    shardno+=1\n",
    "    data = []\n",
    "  if num==0:\n",
    "    break\n",
    "  # prompt = promptGen(oneshot, df.iloc[i]['Premise List'])\n",
    "  # print(prompt)\n",
    "  d = gpt3(oneshot,df.iloc[i]['Premise List'],ppt)\n",
    "  data.append(d)\n",
    "  num-=1\n",
    "dataDF = pd.DataFrame(data)\n",
    "dataDF.to_csv(pathtosave + f\"/shard_oneshot_entailment_rte_{shardno}.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "KZr2IS2amPOr"
   },
   "source": [
    "## Contradiction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "9OoSnXhAmOtg",
    "outputId": "03faf5ec-d6d9-46c6-8568-f6dfac8f5512"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|          | 4/400 [00:06<11:10,  1.69s/it]\n"
     ]
    }
   ],
   "source": [
    "data = []\n",
    "pathtosave = \"/content/drive/MyDrive/Colab Notebooks/DataGen/Commit\"\n",
    "shardno = 1\n",
    "num = 400 #Num of Passage or data to generate\n",
    "oneshot = '''Given a premise, the task is to generate a hypothesis and a label if it is \"not entailment\". An example is given below to help you out. \n",
    "\n",
    "Premise:\n",
    "A man is due in court later charged with the murder 26 years ago of a teenager whose case was the first to be featured on BBC One's Crimewatch. Colette Aram, 16, was walking to her boyfriend's house in Keyworth, Nottinghamshire, on 30 October 1983 when she disappeared. Her body was later found in a field close to her home. Paul Stewart Hutchinson, 50, has been charged with murder and is due before Nottingham magistrates later.\n",
    "\n",
    "Hypothesis:\n",
    "Paul Stewart Hutchinson is accused of having stabbed a girl.\n",
    "\n",
    "Label:\n",
    "not entailment'''\n",
    "#change ppt\n",
    "ppt = '''Generate hypothesis and label now. Be diverse in example generation.  Note that the label should be \"not entailment\".'''\n",
    "df = dfcopy[2500:5000]\n",
    "for i in tqdm(range(df.shape[0])):\n",
    "  if data!=[] and len(data)%100==0:\n",
    "    print(\"Shard no: \", shardno)\n",
    "    dataDF = pd.DataFrame(data)\n",
    "    dataDF.to_csv(pathtosave + f\"/shard_oneshot_not_entailment_rte_{shardno}.csv\", index=False)\n",
    "    shardno+=1\n",
    "    data = []\n",
    "  if num==0:\n",
    "    break\n",
    "  # prompt = promptGen(oneshot, df.iloc[i]['Premise List'])\n",
    "  # print(prompt)\n",
    "  d = gpt3(oneshot,df.iloc[i]['Premise List'],ppt)\n",
    "  data.append(d)\n",
    "  num-=1\n",
    "dataDF = pd.DataFrame(data)\n",
    "dataDF.to_csv(pathtosave + f\"/shard_oneshot_not_entailment_rte_{shardno}.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "yEOynQFhnIpB"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [
    "NKB7cAmGbEEu"
   ],
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
