{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "together-national",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-13T18:01:16.501672Z",
     "start_time": "2023-09-13T18:01:16.077457Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "atlantic-keyboard",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem import PorterStemmer\n",
    "from nltk.tokenize import word_tokenize\n",
    "ps = PorterStemmer()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "compatible-blowing",
   "metadata": {},
   "outputs": [],
   "source": [
    "def do_wic(csv):\n",
    "    \"\"\"Clean a WiC frame and keep only rows whose target word occurs in the text.\n",
    "\n",
    "    Steps:\n",
    "      1. Strip the sense numbering (': 1. ' ... ': 4. ') from ``input``.\n",
    "      2. Read the target word that follows the first 'Word: ' marker.\n",
    "      3. Unwrap '[word]' occurrences of that word.\n",
    "      4. Keep a row only if the Porter stem of the target word equals the stem\n",
    "         of some token of the input outside the 'Word: <word>' header.\n",
    "\n",
    "    Args:\n",
    "        csv: DataFrame with an ``input`` text column and an ``output`` column.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with only ``input`` and ``output`` for the rows that passed\n",
    "        the check. The frame passed in is left unmodified.\n",
    "    \"\"\"\n",
    "    # Work on a copy so the caller's frame is not mutated.\n",
    "    csv = csv.copy()\n",
    "\n",
    "    # regex=False keeps '.' literal; with pandas' older regex default the\n",
    "    # pattern ': 1. ' would also match text such as ': 12 '.\n",
    "    for sense_no in range(1, 5):\n",
    "        csv['input'] = csv['input'].str.replace(f': {sense_no}. ', ': ', regex=False)\n",
    "    print(csv['input'])\n",
    "\n",
    "    csv['wds'] = csv['input'].str.split('Word: ').str[1].str.split(' \\n').str[0]\n",
    "    print(csv['wds'])\n",
    "\n",
    "    cleaned, isp = [], []\n",
    "    for text, header_word in zip(csv['input'], csv['wds']):\n",
    "        # No 'Word: ' marker (NaN) or an empty word: nothing to verify, drop the row.\n",
    "        word = header_word.strip() if isinstance(header_word, str) else ''\n",
    "        word_tokens = word_tokenize(word)\n",
    "        if not word_tokens:\n",
    "            cleaned.append(text)\n",
    "            isp.append(False)\n",
    "            continue\n",
    "\n",
    "        text = text.replace('[' + word + ']', word)\n",
    "        cleaned.append(text)\n",
    "\n",
    "        # Search everything except the 'Word: <word>' header itself; the header\n",
    "        # always contains the word, which would make the check trivially true.\n",
    "        context = text.replace('Word: ' + header_word, '', 1)\n",
    "        stems = {ps.stem(token) for token in word_tokenize(context)}\n",
    "        isp.append(ps.stem(word_tokens[0]) in stems)\n",
    "\n",
    "    csv['input'] = cleaned\n",
    "    keep = pd.Series(isp, index=csv.index, dtype=bool)\n",
    "    print('----------------')\n",
    "    csv = csv.loc[keep, ['input', 'output']]\n",
    "    print(csv)\n",
    "\n",
    "    return csv\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "comparable-dancing",
   "metadata": {},
   "outputs": [],
   "source": [
    "def do_multirc(csv):\n",
    "    csv['input'] = csv['input'].str.replace('Question: Question', 'Question:')\n",
    "    \n",
    "   \n",
    "    return csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "operational-southwest",
   "metadata": {},
   "outputs": [],
   "source": [
    "def change_instr_wic(csv):\n",
    "    \n",
    "    csv['input'] = (['Passage: ']*csv.shape[0])+csv['input'].str.split('Passage: ').str[-1]\n",
    "    \n",
    "    passage2 = \"Question: Which organisms demonstrate bioluminescence?\\nOption:['A) Glowing plankton','B) Deep-sea creatures','C) Terrestrial environments','D) Jellyfish','E) Land-based organisms']\\nOutput: ['A','B','D']\\n\"\n",
    "    \n",
    "    csv['input'] = ([passage2]*csv.shape[0])+csv['input']\n",
    "    return csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "indie-arrangement",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset folder to clean; the cleaning loop tests this name for 'wic' / 'multirc'.\n",
    "dataset = 'multirc'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "accomplished-application",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair every file name (fles) with the folder it came from (flds). Each\n",
    "# folder is listed exactly once, so the two parallel lists cannot drift\n",
    "# apart, and the names are sorted because os.listdir returns entries in\n",
    "# arbitrary order.\n",
    "fles, flds = [], []\n",
    "for synthetic_dir in (f'{dataset}/synthetic', f'{dataset}/synthetic_w_inst'):\n",
    "    names = sorted(os.listdir(synthetic_dir))\n",
    "    fles += names\n",
    "    flds += [synthetic_dir] * len(names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "infinite-hollywood",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                  input                output\n",
      "0     Passage: Climate change has become an increasi...       ['A', 'B', 'C']\n",
      "1     Passage: The urgency to address climate change...            ['A', 'C']\n",
      "2     Passage: Industries, particularly those with h...  ['A', 'B', 'C', 'D']\n",
      "3     Passage: The rising global concerns over clima...       ['A', 'B', 'C']\n",
      "4     Passage: In a groundbreaking development, scie...  ['A', 'B', 'C', 'E']\n",
      "...                                                 ...                   ...\n",
      "4635  Passage: The artistic fusion of traditional te...       ['C', 'B', 'E']\n",
      "4636  Passage: One way in which artists are blending...       ['A', 'B', 'D']\n",
      "4637  Passage: Digital media has also opened up new ...  ['A', 'B', 'C', 'D']\n",
      "4638  Passage: Traditional printmaking techniques, s...            ['A', 'B']\n",
      "4639  Passage: Finally, the fusion of traditional te...  ['A', 'B', 'C', 'D']\n",
      "\n",
      "[4640 rows x 2 columns]\n",
      "-----------------------\n",
      "0       Passage: Climate change has become an increasi...\n",
      "1       Passage: The urgency to address climate change...\n",
      "2       Passage: Industries, particularly those with h...\n",
      "3       Passage: The rising global concerns over clima...\n",
      "4       Passage: In a groundbreaking development, scie...\n",
      "                              ...                        \n",
      "4635    Passage: The artistic fusion of traditional te...\n",
      "4636    Passage: One way in which artists are blending...\n",
      "4637    Passage: Digital media has also opened up new ...\n",
      "4638    Passage: Traditional printmaking techniques, s...\n",
      "4639    Passage: Finally, the fusion of traditional te...\n",
      "Name: input, Length: 4640, dtype: object\n",
      "                                                 input                output\n",
      "0    Passage: What causes a change in motion? The a...            ['B', 'C']\n",
      "1    Passage: What causes a change in motion? The a...            ['C', 'D']\n",
      "2    Passage: What causes a change in motion? The a...                 ['D']\n",
      "3    Passage: What causes a change in motion? The a...  ['B', 'D', 'E', 'F']\n",
      "4    Passage: The film opens with Sunita , a medica...       ['D', 'E', 'F']\n",
      "..                                                 ...                   ...\n",
      "948  Passage: The bar was manned by an expensive hu...                 ['A']\n",
      "949  Passage: The bar was manned by an expensive hu...                 ['A']\n",
      "950  Passage: The bar was manned by an expensive hu...       ['A', 'B', 'C']\n",
      "951  Passage: The bar was manned by an expensive hu...       ['B', 'C', 'D']\n",
      "952  Passage: The bar was manned by an expensive hu...       ['A', 'B', 'D']\n",
      "\n",
      "[953 rows x 2 columns]\n",
      "-----------------------\n",
      "0      Passage: What causes a change in motion? The a...\n",
      "1      Passage: What causes a change in motion? The a...\n",
      "2      Passage: What causes a change in motion? The a...\n",
      "3      Passage: What causes a change in motion? The a...\n",
      "4      Passage: The film opens with Sunita , a medica...\n",
      "                             ...                        \n",
      "948    Passage: The bar was manned by an expensive hu...\n",
      "949    Passage: The bar was manned by an expensive hu...\n",
      "950    Passage: The bar was manned by an expensive hu...\n",
      "951    Passage: The bar was manned by an expensive hu...\n",
      "952    Passage: The bar was manned by an expensive hu...\n",
      "Name: input, Length: 953, dtype: object\n",
      "                                                  input                output\n",
      "0     You are given a passage followed by a question...       ['A', 'B', 'C']\n",
      "1     You are given a passage followed by a question...            ['A', 'C']\n",
      "2     You are given a passage followed by a question...  ['A', 'B', 'C', 'D']\n",
      "3     You are given a passage followed by a question...       ['A', 'B', 'C']\n",
      "4     You are given a passage followed by a question...  ['A', 'B', 'C', 'E']\n",
      "...                                                 ...                   ...\n",
      "4635  You are given a passage followed by a question...       ['C', 'B', 'E']\n",
      "4636  You are given a passage followed by a question...       ['A', 'B', 'D']\n",
      "4637  You are given a passage followed by a question...  ['A', 'B', 'C', 'D']\n",
      "4638  You are given a passage followed by a question...            ['A', 'B']\n",
      "4639  You are given a passage followed by a question...  ['A', 'B', 'C', 'D']\n",
      "\n",
      "[4640 rows x 2 columns]\n",
      "-----------------------\n",
      "0       You are given a passage followed by a question...\n",
      "1       You are given a passage followed by a question...\n",
      "2       You are given a passage followed by a question...\n",
      "3       You are given a passage followed by a question...\n",
      "4       You are given a passage followed by a question...\n",
      "                              ...                        \n",
      "4635    You are given a passage followed by a question...\n",
      "4636    You are given a passage followed by a question...\n",
      "4637    You are given a passage followed by a question...\n",
      "4638    You are given a passage followed by a question...\n",
      "4639    You are given a passage followed by a question...\n",
      "Name: input, Length: 4640, dtype: object\n",
      "                                                 input                output\n",
      "0    You are given a passage followed by a question...            ['B', 'C']\n",
      "1    You are given a passage followed by a question...            ['C', 'D']\n",
      "2    You are given a passage followed by a question...                 ['D']\n",
      "3    You are given a passage followed by a question...  ['B', 'D', 'E', 'F']\n",
      "4    You are given a passage followed by a question...       ['D', 'E', 'F']\n",
      "..                                                 ...                   ...\n",
      "948  You are given a passage followed by a question...                 ['A']\n",
      "949  You are given a passage followed by a question...                 ['A']\n",
      "950  You are given a passage followed by a question...       ['A', 'B', 'C']\n",
      "951  You are given a passage followed by a question...       ['B', 'C', 'D']\n",
      "952  You are given a passage followed by a question...       ['A', 'B', 'D']\n",
      "\n",
      "[953 rows x 2 columns]\n",
      "-----------------------\n",
      "0      You are given a passage followed by a question...\n",
      "1      You are given a passage followed by a question...\n",
      "2      You are given a passage followed by a question...\n",
      "3      You are given a passage followed by a question...\n",
      "4      You are given a passage followed by a question...\n",
      "                             ...                        \n",
      "948    You are given a passage followed by a question...\n",
      "949    You are given a passage followed by a question...\n",
      "950    You are given a passage followed by a question...\n",
      "951    You are given a passage followed by a question...\n",
      "952    You are given a passage followed by a question...\n",
      "Name: input, Length: 953, dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Clean every raw CSV of the selected dataset and save the result next to\n",
    "# the original as '<name>_clean.csv'.\n",
    "for folder, file in zip(flds, fles):\n",
    "    stem, ext = os.path.splitext(file)\n",
    "    # Only files whose extension is '.csv'; names containing 'stest' or\n",
    "    # '_clean' (previously written output) are skipped.\n",
    "    if ext != '.csv' or 'stest' in file or '_clean' in file:\n",
    "        continue\n",
    "    pth = os.path.join(folder, file)\n",
    "    csv = pd.read_csv(pth)\n",
    "    csv = csv.dropna()\n",
    "    print(csv)\n",
    "    print('-----------------------')\n",
    "    print(csv['input'])\n",
    "\n",
    "    if 'wic' in dataset:\n",
    "        csv = do_wic(csv)\n",
    "\n",
    "    if 'multirc' in dataset:\n",
    "        csv = do_multirc(csv)\n",
    "        # Files from the '*_w_inst' folder also get the worked-example prefix.\n",
    "        if 'w_inst' in folder:\n",
    "            csv = change_instr_wic(csv)\n",
    "\n",
    "    # Build the output name from the stem; str.replace('.csv', ...) would\n",
    "    # rewrite every '.csv' occurrence in the file name, not just the extension.\n",
    "    csv.to_csv(os.path.join(folder, stem + '_clean.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "spatial-cleveland",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "thorough-birmingham",
   "metadata": {},
   "outputs": [],
   "source": [
    "# st = 'Word: presence \\nSentence 1: The artist\\'s work'\n",
    "# pi = [ps.stem(token) for token in word_tokenize(st)]\n",
    "# pi2 = ps.stem(word_tokenize('artistic')[0])\n",
    "# print(pi,pi2)\n",
    "# print(pi2 in pi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "sacred-carpet",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-13T18:01:43.429412Z",
     "start_time": "2023-09-13T18:01:43.390410Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>input</th>\n",
       "      <th>output</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The barista was friendly and greeted the custo...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>The waiter was very attentive to the diner bec...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The chef was thrilled to collaborate with the ...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The cafe owner is always happy to see the regu...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The baker made a delicious cake and gave it to...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>565</th>\n",
       "      <td>The surfing enthusiasts were eager to learn fr...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>566</th>\n",
       "      <td>The vacationing families enjoyed their time at...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>567</th>\n",
       "      <td>The defendants are nervous about the trial, as...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>568</th>\n",
       "      <td>The witnesses were terrified of the bailiffs a...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>569</th>\n",
       "      <td>The prosecutors present the evidence in court ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>570 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 input  output\n",
       "0    The barista was friendly and greeted the custo...   False\n",
       "1    The waiter was very attentive to the diner bec...   False\n",
       "2    The chef was thrilled to collaborate with the ...   False\n",
       "3    The cafe owner is always happy to see the regu...   False\n",
       "4    The baker made a delicious cake and gave it to...   False\n",
       "..                                                 ...     ...\n",
       "565  The surfing enthusiasts were eager to learn fr...    True\n",
       "566  The vacationing families enjoyed their time at...    True\n",
       "567  The defendants are nervous about the trial, as...    True\n",
       "568  The witnesses were terrified of the bailiffs a...    True\n",
       "569  The prosecutors present the evidence in court ...    True\n",
       "\n",
       "[570 rows x 2 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load wsc/synthetic/strain.csv; the bare `df` on the last line displays it.\n",
    "df = pd.read_csv(\"wsc/synthetic/strain.csv\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d04e8535",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-13T18:02:22.931865Z",
     "start_time": "2023-09-13T18:02:22.925473Z"
    }
   },
   "outputs": [],
   "source": [
    "# Instruction text with two worked examples. It ends with 'Input:' plus a\n",
    "# newline so that a row's own input can be appended directly after it.\n",
    "prompt = '''The given input talks about 2 noun phrases (Subject 1 and Subject 2). You are given Subject 1 and a Pronoun that occurs in the input. \n",
    "Rules:\n",
    "1. If the pronoun refers to the noun phrase Subject 1, output LABEL: TRUE. \n",
    "2. If the pronoun refers to the noun phrase Subject 2, output LABEL: FALSE. \n",
    "Explain your reasoning.\n",
    "\n",
    "Example: \n",
    "Input: The city councilmen refused the demonstrators a permit because they feared violence. Subject 1: The city councilmen. Pronoun: They.\n",
    "Output: TRUE\n",
    "Explanation:\n",
    "Noun phrases: city councilmen (Subject 1) ; demonstrators (Subject 2)\n",
    "The pronoun “they” occurs in the phrase “they feared violence”. Out of the 2 subjects, demonstrators are more likely to commit violence, and city councilmen are more likely to fear the violence. This phrase must be talking of the city councilmen (Subject 1). This follows Rule 1.\n",
    "\n",
    "\n",
    "Example: \n",
    "Input: The scientist studied the lion because he was paid to do so. Subject 1: lion. Pronoun: he\n",
    "Output: FALSE\n",
    "Explanation:\n",
    "Noun phrases: lion (Subject 1) ; scientist (Subject 2)\n",
    "The pronoun “he” occurs in the phrase “he was paid to do so”. This is unlikely to refer to the lion, as the scientist (Subject 2) is more likely to be paid to study something. Thus, the pronoun must refer to Subject 2.\n",
    "\n",
    "Input:\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "89ac52a2",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-13T18:02:42.155130Z",
     "start_time": "2023-09-13T18:02:42.009844Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>input</th>\n",
       "      <th>output</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The given input talks about 2 noun phrases (Su...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>The given input talks about 2 noun phrases (Su...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The given input talks about 2 noun phrases (Su...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The given input talks about 2 noun phrases (Su...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The given input talks about 2 noun phrases (Su...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>565</th>\n",
       "      <td>The given input talks about 2 noun phrases (Su...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>566</th>\n",
       "      <td>The given input talks about 2 noun phrases (Su...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>567</th>\n",
       "      <td>The given input talks about 2 noun phrases (Su...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>568</th>\n",
       "      <td>The given input talks about 2 noun phrases (Su...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>569</th>\n",
       "      <td>The given input talks about 2 noun phrases (Su...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>570 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 input  output\n",
       "0    The given input talks about 2 noun phrases (Su...   False\n",
       "1    The given input talks about 2 noun phrases (Su...   False\n",
       "2    The given input talks about 2 noun phrases (Su...   False\n",
       "3    The given input talks about 2 noun phrases (Su...   False\n",
       "4    The given input talks about 2 noun phrases (Su...   False\n",
       "..                                                 ...     ...\n",
       "565  The given input talks about 2 noun phrases (Su...    True\n",
       "566  The given input talks about 2 noun phrases (Su...    True\n",
       "567  The given input talks about 2 noun phrases (Su...    True\n",
       "568  The given input talks about 2 noun phrases (Su...    True\n",
       "569  The given input talks about 2 noun phrases (Su...    True\n",
       "\n",
       "[570 rows x 2 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Prepend the instruction prompt to every input. Rows that already start\n",
    "# with the prompt are left as they are, so re-running this cell does not\n",
    "# stack a second copy of the prompt.\n",
    "has_prompt = df['input'].str.startswith(prompt, na=False)\n",
    "df['input'] = df['input'].where(has_prompt, prompt + df['input'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb71f585",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the frame to strain_inst.csv (path relative to the working directory), without the index column.\n",
    "df.to_csv(\"strain_inst.csv\",index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
