{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "60d3199a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-08T20:03:51.177619Z",
     "start_time": "2023-03-08T20:03:51.173308Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "1a491097",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-08T20:03:51.549167Z",
     "start_time": "2023-03-08T20:03:51.526286Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>topics</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The impact of social media on society</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Climate change and its effects on the planet</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The benefits of a healthy diet and exercise</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The importance of education in society</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The ethics of animal testing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>495</th>\n",
       "      <td>The role of public transportation in promoting...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>The importance of cultural sensitivity in heal...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>497</th>\n",
       "      <td>The impact of artificial intelligence on creat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>498</th>\n",
       "      <td>The effects of climate change on marine life</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499</th>\n",
       "      <td>The causes and effects of soil degradation.</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>500 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                topics\n",
       "0                The impact of social media on society\n",
       "1         Climate change and its effects on the planet\n",
       "2          The benefits of a healthy diet and exercise\n",
       "3               The importance of education in society\n",
       "4                         The ethics of animal testing\n",
       "..                                                 ...\n",
       "495  The role of public transportation in promoting...\n",
       "496  The importance of cultural sensitivity in heal...\n",
       "497  The impact of artificial intelligence on creat...\n",
       "498       The effects of climate change on marine life\n",
       "499        The causes and effects of soil degradation.\n",
       "\n",
       "[500 rows x 1 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"passage_topics.csv\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "43208822",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-08T20:03:53.043872Z",
     "start_time": "2023-03-08T20:03:53.039341Z"
    }
   },
   "outputs": [],
   "source": [
    "ip_list2 = list(df['topics'])\n",
    "ip_list2 = ip_list2[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "2d9d21a0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-08T20:03:53.776862Z",
     "start_time": "2023-03-08T20:03:53.772859Z"
    }
   },
   "outputs": [],
   "source": [
    "prompt1 = '''Write a passage of about 500 words on the topic below:\n",
    "\n",
    "Topic: '''\n",
    "\n",
    "\n",
    "\n",
    "# prompt2 = \"\\nOutput: \""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "7379287a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-08T20:03:55.983483Z",
     "start_time": "2023-03-08T20:03:55.979968Z"
    }
   },
   "outputs": [],
   "source": [
    "openai.api_key = \"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\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "5a3952df",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-08T20:04:08.217985Z",
     "start_time": "2023-03-08T20:04:08.210878Z"
    }
   },
   "outputs": [],
   "source": [
    "def chat_gpt_response(sample):\n",
    "    final_prompt = prompt1 + sample\n",
    "#     print(\"final prompt----------------\",final_prompt)\n",
    "    response=openai.ChatCompletion.create(\n",
    "        model=\"gpt-3.5-turbo\",\n",
    "        messages= [{\"role\": \"user\", \"content\": final_prompt}],\n",
    "        temperature=0.7,\n",
    "        max_tokens=500,\n",
    "        top_p=1,\n",
    "        frequency_penalty=0,\n",
    "        presence_penalty=0)\n",
    "    \n",
    "    reply = response[\"choices\"][0][\"message\"][\"content\"]\n",
    "    return reply"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "e89a5387",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-08T20:07:18.540330Z",
     "start_time": "2023-03-08T20:04:15.095899Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "shard number: 1 Done\n",
      "shard number: 2 Done\n",
      "shard number: 3 Done\n",
      "shard number: 4 Done\n",
      "shard number: 5 Done\n"
     ]
    }
   ],
   "source": [
    "count = 0\n",
    "\n",
    "batch_size = 2  #Tunable parameter\n",
    "\n",
    "while len(ip_list2) > 0:\n",
    "    temp_list = ip_list2[:batch_size]\n",
    "#     temp_op_list = op_list2[:batch_size]    \n",
    "    output_list = []\n",
    "\n",
    "    for item in temp_list:\n",
    "        output_list.append(chat_gpt_response(item))\n",
    "#         output_list.append(sample_response(item))\n",
    "    \n",
    "    data = {'Input': temp_list ,'Output': output_list }\n",
    "    df_ans = pd.DataFrame(data)\n",
    "    df_ans.to_csv(\"passage_output_shard_\" + str(count) + \".csv\")\n",
    "    count+=1\n",
    "    print(\"shard number:\",count,\"Done\")\n",
    "    ip_list2 = ip_list2[batch_size:]\n",
    "#     op_list2 = op_list2[batch_size:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "65178340",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T06:06:33.158323Z",
     "start_time": "2023-03-10T06:06:33.153760Z"
    }
   },
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d104cb20",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T06:06:37.345050Z",
     "start_time": "2023-03-10T06:06:33.498270Z"
    }
   },
   "outputs": [],
   "source": [
    "folder_path = \"/Users/him1411/Desktop/ra_work/project_5_gpt3_dataset_generation/data/dont_open/\"\n",
    "files = os.listdir(folder_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "13bfa95b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T06:06:37.352473Z",
     "start_time": "2023-03-10T06:06:37.347372Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'article_162679.txt'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d9851512",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T06:06:37.358856Z",
     "start_time": "2023-03-10T06:06:37.354622Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'article_475680.txt'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a93e4b10",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T06:06:37.364891Z",
     "start_time": "2023-03-10T06:06:37.361298Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'article_293638.txt'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "656cf1a6",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T06:07:24.895674Z",
     "start_time": "2023-03-10T06:06:55.869839Z"
    }
   },
   "outputs": [],
   "source": [
    "passage_list = []\n",
    "\n",
    "for file in files:\n",
    "    path = folder_path + str(file)\n",
    "    with open(path, 'r') as file:\n",
    "        data = file.read()\n",
    "        \n",
    "        if len(data.split()) < 400 and len(data.split()) > 300:\n",
    "            passage_list.append(data)\n",
    "            \n",
    "        if len(passage_list)> 10000:\n",
    "            break\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "56a22ca6",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T06:07:43.968949Z",
     "start_time": "2023-03-10T06:07:43.962599Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"By . Tara Brady . PUBLISHED: . 14:00 EST, 7 January 2014 . | . UPDATED: . 19:04 EST, 7 January 2014 . Surfers are making the most of the weather by riding massive 65ft waves off the French coast following the recent storms. The Belharra giant waves are only surfed by experts who are towed out by a water scooter to catch the surf. Thanks to certain climatic conditions in autumn and winter, a strong swell hits the Belharra Perdun underwater spur enabling a 30ft to 65ft wave to form. Monster wave: A man surfs the Belharra giant waves off the French coast which reach up to 20 metres high thanks to weather conditions . Among those who have made the trip is Shane Dorian, 41, who has travelled from Hawaii and caught just two of the 20-metre waves today. The waves today moved at 25mph. He told the Guardian: I was pretty nervous because there was some heat behind the swell. I had some butterflies in my stomach.' Europe is not normally associated with big waves but this monster is up there with some of the biggest waves in the world. Belharra is situated a mile and a half offshore from Saint Jean De Luz in the heart of the French basque region. Rather you than me: The Belharra giant waves are only surfed by experts who are towed out by a water scooter to catch the monster surf . Belharra is situated a mile and a half offshore from Saint Jean De Luz in the heart of the French basque region. A couple of surfers rode the Belharra in 2003 at 60ft, but even to this day no one really knows what size it will hold. Epic: Thanks to certain climatic conditions in autumn and winter, a strong swell hits the Belharra Perdun underwater spur enabling a 10 to 20 metre wave to form . The wave only breaks on rare occasions marching in along a deep ocean trench and unloading open ocean power with waves reaching epic proportions. A couple of local surfers rode the Belharra in 2003 at 60ft, but even to this day no one really knows what size it will hold.\""
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "passage_list[9999]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "d65835fd",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T06:08:00.136671Z",
     "start_time": "2023-03-10T06:07:58.353434Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "748dcbce",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-10T06:08:00.979739Z",
     "start_time": "2023-03-10T06:08:00.512736Z"
    }
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame(passage_list, columns=['Passage'])\n",
    "\n",
    "df.to_csv('big_passage_list.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0f69e5c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
