{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "112ff5e2",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-20T19:30:07.115475Z",
     "start_time": "2023-07-20T19:30:06.839558Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e006be86",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-20T19:30:28.499174Z",
     "start_time": "2023-07-20T19:30:28.434086Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Input</th>\n",
       "      <th>Output</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Users are being asked to download an app that ...</td>\n",
       "      <td>Query: In 2017, telecoms operators will conduc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>The champions top the table going into the New...</td>\n",
       "      <td>Query: The champions hope to secure their lead...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The Mail on Sunday reported Afzal Amin plotted...</td>\n",
       "      <td>Query: The secretly filmed footage obtained by...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>By . Rob Waugh . UPDATED: . 12:43 EST, 22 Dece...</td>\n",
       "      <td>Query: The spectrograph discovered a strong 'a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Media playback is not supported on this device...</td>\n",
       "      <td>Query: Benaud was known for his expertise in [...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2295</th>\n",
       "      <td>The Primary Modern Languages Programme has bee...</td>\n",
       "      <td>Query: Despite the budget cuts, the Department...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2296</th>\n",
       "      <td>(CNN) -- The family battle that surrounded rad...</td>\n",
       "      <td>Query: Despite Casey Kasem's passing, his body...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2297</th>\n",
       "      <td>(CNN) -- The Dallas Mavericks took a giant lea...</td>\n",
       "      <td>Query: Despite playing through a hip injury, [...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2298</th>\n",
       "      <td>Doula, Cameroon (CNN) -- Gunmen opposed to Cam...</td>\n",
       "      <td>Query: Eyewitnesses reported that the gunmen, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2299</th>\n",
       "      <td>This is the photo taken seconds before a safar...</td>\n",
       "      <td>Query: If they had followed the advice of the ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2300 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  Input  \\\n",
       "0     Users are being asked to download an app that ...   \n",
       "1     The champions top the table going into the New...   \n",
       "2     The Mail on Sunday reported Afzal Amin plotted...   \n",
       "3     By . Rob Waugh . UPDATED: . 12:43 EST, 22 Dece...   \n",
       "4     Media playback is not supported on this device...   \n",
       "...                                                 ...   \n",
       "2295  The Primary Modern Languages Programme has bee...   \n",
       "2296  (CNN) -- The family battle that surrounded rad...   \n",
       "2297  (CNN) -- The Dallas Mavericks took a giant lea...   \n",
       "2298  Doula, Cameroon (CNN) -- Gunmen opposed to Cam...   \n",
       "2299  This is the photo taken seconds before a safar...   \n",
       "\n",
       "                                                 Output  \n",
       "0     Query: In 2017, telecoms operators will conduc...  \n",
       "1     Query: The champions hope to secure their lead...  \n",
       "2     Query: The secretly filmed footage obtained by...  \n",
       "3     Query: The spectrograph discovered a strong 'a...  \n",
       "4     Query: Benaud was known for his expertise in [...  \n",
       "...                                                 ...  \n",
       "2295  Query: Despite the budget cuts, the Department...  \n",
       "2296  Query: Despite Casey Kasem's passing, his body...  \n",
       "2297  Query: Despite playing through a hip injury, [...  \n",
       "2298  Query: Eyewitnesses reported that the gunmen, ...  \n",
       "2299  Query: If they had followed the advice of the ...  \n",
       "\n",
       "[2300 rows x 2 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"record_output_responses.csv\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "d9a68a93",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-20T21:00:45.576331Z",
     "start_time": "2023-07-20T21:00:45.569628Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'In 2017, telecoms operators will conduct a detailed comparison study using data that has been [X].'"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Output'][0].split(\"Answer:\")[0].split(\"Query:\")[1].strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "d5907454",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-20T21:22:13.042710Z",
     "start_time": "2023-07-20T21:22:13.029610Z"
    }
   },
   "outputs": [],
   "source": [
    "passages = list(df['Input'])\n",
    "responses = list(df['Output'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "50d8ffec",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-20T21:30:24.511946Z",
     "start_time": "2023-07-20T21:30:24.483075Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "list index out of range\n",
      "1777\n",
      "1777\n"
     ]
    }
   ],
   "source": [
    "ip_list = []\n",
    "op_list = []\n",
    "for x,temp in zip(passages,responses):\n",
    "    try:\n",
    "        question = temp.split(\"Answer:\")[0].split(\"Query:\")[1].strip()\n",
    "        answer = temp.split(\"Answer:\")[1].split(\"Explanation\")[0].strip()\n",
    "        if '[X]' in question:\n",
    "#             print(x)\n",
    "#             print(question)\n",
    "            ip = \"Passage: \" + x + \"\\nQuestion:\" + question\n",
    "            ip_list.append(ip)\n",
    "            op_list.append(answer)\n",
    "#             print(answer)\n",
    "#             print(\"----------------------------\")\n",
    "    except Exception as e:\n",
    "#         continue\n",
    "        print(e)\n",
    "\n",
    "#     break\n",
    "\n",
    "print(len(ip_list))\n",
    "print(len(op_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "34354934",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-20T21:30:55.631028Z",
     "start_time": "2023-07-20T21:30:55.607755Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>input</th>\n",
       "      <th>output</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Passage: Users are being asked to download an ...</td>\n",
       "      <td>anonymised</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Passage: The champions top the table going int...</td>\n",
       "      <td>tactics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Passage: The Mail on Sunday reported Afzal Ami...</td>\n",
       "      <td>Mr. Amin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Passage: By . Rob Waugh . UPDATED: . 12:43 EST...</td>\n",
       "      <td>Pluto</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Passage: Media playback is not supported on th...</td>\n",
       "      <td>leg-spin bowling</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1772</th>\n",
       "      <td>Passage: The Primary Modern Languages Programm...</td>\n",
       "      <td>Four hundred and thirteen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1773</th>\n",
       "      <td>Passage: (CNN) -- The family battle that surro...</td>\n",
       "      <td>Casey Kasem</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1774</th>\n",
       "      <td>Passage: (CNN) -- The Dallas Mavericks took a ...</td>\n",
       "      <td>Dwyane Wade</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1775</th>\n",
       "      <td>Passage: Doula, Cameroon (CNN) -- Gunmen oppos...</td>\n",
       "      <td>President Paul Biya</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1776</th>\n",
       "      <td>Passage: This is the photo taken seconds befor...</td>\n",
       "      <td>Chantal Beyer</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1777 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  input  \\\n",
       "0     Passage: Users are being asked to download an ...   \n",
       "1     Passage: The champions top the table going int...   \n",
       "2     Passage: The Mail on Sunday reported Afzal Ami...   \n",
       "3     Passage: By . Rob Waugh . UPDATED: . 12:43 EST...   \n",
       "4     Passage: Media playback is not supported on th...   \n",
       "...                                                 ...   \n",
       "1772  Passage: The Primary Modern Languages Programm...   \n",
       "1773  Passage: (CNN) -- The family battle that surro...   \n",
       "1774  Passage: (CNN) -- The Dallas Mavericks took a ...   \n",
       "1775  Passage: Doula, Cameroon (CNN) -- Gunmen oppos...   \n",
       "1776  Passage: This is the photo taken seconds befor...   \n",
       "\n",
       "                         output  \n",
       "0                    anonymised  \n",
       "1                       tactics  \n",
       "2                      Mr. Amin  \n",
       "3                         Pluto  \n",
       "4              leg-spin bowling  \n",
       "...                         ...  \n",
       "1772  Four hundred and thirteen  \n",
       "1773                Casey Kasem  \n",
       "1774                Dwyane Wade  \n",
       "1775        President Paul Biya  \n",
       "1776              Chantal Beyer  \n",
       "\n",
       "[1777 rows x 2 columns]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2 = pd.DataFrame({'input': ip_list,'output':op_list})\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "ed0e4bb9",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-20T21:32:41.394569Z",
     "start_time": "2023-07-20T21:32:41.299999Z"
    }
   },
   "outputs": [],
   "source": [
    "df2.to_csv(\"train.csv\",index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5a5d62d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
