{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "together-national",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "atlantic-keyboard",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem import PorterStemmer\n",
    "from nltk.tokenize import word_tokenize\n",
    "# Single PorterStemmer instance; do_wic calls ps.stem on every token it compares.\n",
    "ps = PorterStemmer()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "compatible-blowing",
   "metadata": {},
   "outputs": [],
   "source": [
    "def do_wic(csv):\n",
    "    \"\"\"Clean WiC prompts and keep only rows whose text uses the target word.\n",
    "\n",
    "    Works on a copy, so the caller's DataFrame is never modified.\n",
    "\n",
    "    Cleaning applied to the ``input`` column:\n",
    "      1. The literal list markers ': 1. ' to ': 4. ' are replaced by ': '.\n",
    "      2. The target word is read as the text after the first 'Word: ' marker,\n",
    "         up to the following space-plus-newline.\n",
    "      3. The bracketed target word is unwrapped: '[word]' becomes 'word'.\n",
    "\n",
    "    A row is kept when the Porter stem of the target word's first token equals\n",
    "    the stem of some token found outside the 'Word: ...' header. The header is\n",
    "    left out of the search because it always contains the word itself, which\n",
    "    would let every row pass.\n",
    "\n",
    "    Args:\n",
    "        csv: DataFrame with ``input`` and ``output`` columns.\n",
    "\n",
    "    Returns:\n",
    "        New DataFrame holding only the ``input`` and ``output`` columns of the\n",
    "        rows that passed the check. Rows without a 'Word: ' marker, or with an\n",
    "        empty target word, cannot be checked and are dropped.\n",
    "    \"\"\"\n",
    "    csv = csv.copy()\n",
    "\n",
    "    # regex=False: the markers contain '.', which must not act as a wildcard.\n",
    "    for marker in (': 1. ', ': 2. ', ': 3. ', ': 4. '):\n",
    "        csv['input'] = csv['input'].str.replace(marker, ': ', regex=False)\n",
    "\n",
    "    csv['wds'] = csv['input'].str.split('Word: ').str[1].str.split(' \\n').str[0]\n",
    "\n",
    "    cleaned_inputs = []\n",
    "    is_present = []\n",
    "    for text, raw_word in zip(csv['input'], csv['wds']):\n",
    "        # raw_word is NaN when the row has no 'Word: ' marker.\n",
    "        word = raw_word.strip() if isinstance(raw_word, str) else ''\n",
    "        word_tokens = word_tokenize(word) if word else []\n",
    "        if not word_tokens:\n",
    "            cleaned_inputs.append(text)\n",
    "            is_present.append(False)\n",
    "            continue\n",
    "\n",
    "        text = text.replace('[' + word + ']', word)\n",
    "        cleaned_inputs.append(text)\n",
    "\n",
    "        # Search everything except the header that names the word.\n",
    "        context = text.replace('Word: ' + raw_word, ' ', 1)\n",
    "        context_stems = {ps.stem(token) for token in word_tokenize(context)}\n",
    "        is_present.append(ps.stem(word_tokens[0]) in context_stems)\n",
    "\n",
    "    csv['input'] = cleaned_inputs\n",
    "    # Positional boolean mask, so a non-unique index cannot break the selection.\n",
    "    mask = np.array(is_present, dtype=bool)\n",
    "    return csv.loc[mask, ['input', 'output']]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "comparable-dancing",
   "metadata": {},
   "outputs": [],
   "source": [
    "def do_multirc(csv):\n",
    "    \"\"\"Collapse the duplicated question label in the ``input`` column.\n",
    "\n",
    "    Each occurrence of 'Question: Question' becomes 'Question:'. The column is\n",
    "    overwritten on ``csv`` itself, and that same frame is returned.\n",
    "    \"\"\"\n",
    "    relabelled = csv['input'].str.replace('Question: Question', 'Question:')\n",
    "    csv['input'] = relabelled\n",
    "    return csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "operational-southwest",
   "metadata": {},
   "outputs": [],
   "source": [
    "def change_instr_wic(csv):\n",
    "    \"\"\"Swap the text in front of each passage for one fixed worked example.\n",
    "\n",
    "    For every ``input`` value, only the text after the last 'Passage: ' marker\n",
    "    is kept (the whole value when there is no marker). That text gets the\n",
    "    'Passage: ' label back and is prefixed with a hard-coded question /\n",
    "    options / answer example. ``csv`` is updated in place and returned.\n",
    "    \"\"\"\n",
    "    example = (\n",
    "        \"Question: Which organisms demonstrate bioluminescence?\\n\"\n",
    "        \"Option:['A) Glowing plankton','B) Deep-sea creatures',\"\n",
    "        \"'C) Terrestrial environments','D) Jellyfish','E) Land-based organisms']\\n\"\n",
    "        \"Output: ['A','B','D']\\n\"\n",
    "    )\n",
    "    passage_body = csv['input'].str.split('Passage: ').str[-1]\n",
    "    csv['input'] = example + ('Passage: ' + passage_body)\n",
    "    return csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "indie-arrangement",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset directory to process; the name also picks the cleaning routine\n",
    "# ('wic' or 'multirc') applied by the processing loop.\n",
    "dataset = 'multirc'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "accomplished-application",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair every entry of the two synthetic-data folders with the folder it lives in.\n",
    "# Each folder is listed exactly once, so flds[i] is always the folder of fles[i].\n",
    "fles, flds = [], []\n",
    "for folder in (f'{dataset}/synthetic', f'{dataset}/synthetic_w_inst'):\n",
    "    names = sorted(os.listdir(folder))  # sorted for a reproducible processing order\n",
    "    fles.extend(names)\n",
    "    flds.extend([folder] * len(names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "infinite-hollywood",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                  input                output\n",
      "0     Passage: Climate change has become an increasi...       ['A', 'B', 'C']\n",
      "1     Passage: The urgency to address climate change...            ['A', 'C']\n",
      "2     Passage: Industries, particularly those with h...  ['A', 'B', 'C', 'D']\n",
      "3     Passage: The rising global concerns over clima...       ['A', 'B', 'C']\n",
      "4     Passage: In a groundbreaking development, scie...  ['A', 'B', 'C', 'E']\n",
      "...                                                 ...                   ...\n",
      "4635  Passage: The artistic fusion of traditional te...       ['C', 'B', 'E']\n",
      "4636  Passage: One way in which artists are blending...       ['A', 'B', 'D']\n",
      "4637  Passage: Digital media has also opened up new ...  ['A', 'B', 'C', 'D']\n",
      "4638  Passage: Traditional printmaking techniques, s...            ['A', 'B']\n",
      "4639  Passage: Finally, the fusion of traditional te...  ['A', 'B', 'C', 'D']\n",
      "\n",
      "[4640 rows x 2 columns]\n",
      "-----------------------\n",
      "0       Passage: Climate change has become an increasi...\n",
      "1       Passage: The urgency to address climate change...\n",
      "2       Passage: Industries, particularly those with h...\n",
      "3       Passage: The rising global concerns over clima...\n",
      "4       Passage: In a groundbreaking development, scie...\n",
      "                              ...                        \n",
      "4635    Passage: The artistic fusion of traditional te...\n",
      "4636    Passage: One way in which artists are blending...\n",
      "4637    Passage: Digital media has also opened up new ...\n",
      "4638    Passage: Traditional printmaking techniques, s...\n",
      "4639    Passage: Finally, the fusion of traditional te...\n",
      "Name: input, Length: 4640, dtype: object\n",
      "                                                 input                output\n",
      "0    Passage: What causes a change in motion? The a...            ['B', 'C']\n",
      "1    Passage: What causes a change in motion? The a...            ['C', 'D']\n",
      "2    Passage: What causes a change in motion? The a...                 ['D']\n",
      "3    Passage: What causes a change in motion? The a...  ['B', 'D', 'E', 'F']\n",
      "4    Passage: The film opens with Sunita , a medica...       ['D', 'E', 'F']\n",
      "..                                                 ...                   ...\n",
      "948  Passage: The bar was manned by an expensive hu...                 ['A']\n",
      "949  Passage: The bar was manned by an expensive hu...                 ['A']\n",
      "950  Passage: The bar was manned by an expensive hu...       ['A', 'B', 'C']\n",
      "951  Passage: The bar was manned by an expensive hu...       ['B', 'C', 'D']\n",
      "952  Passage: The bar was manned by an expensive hu...       ['A', 'B', 'D']\n",
      "\n",
      "[953 rows x 2 columns]\n",
      "-----------------------\n",
      "0      Passage: What causes a change in motion? The a...\n",
      "1      Passage: What causes a change in motion? The a...\n",
      "2      Passage: What causes a change in motion? The a...\n",
      "3      Passage: What causes a change in motion? The a...\n",
      "4      Passage: The film opens with Sunita , a medica...\n",
      "                             ...                        \n",
      "948    Passage: The bar was manned by an expensive hu...\n",
      "949    Passage: The bar was manned by an expensive hu...\n",
      "950    Passage: The bar was manned by an expensive hu...\n",
      "951    Passage: The bar was manned by an expensive hu...\n",
      "952    Passage: The bar was manned by an expensive hu...\n",
      "Name: input, Length: 953, dtype: object\n",
      "                                                  input                output\n",
      "0     You are given a passage followed by a question...       ['A', 'B', 'C']\n",
      "1     You are given a passage followed by a question...            ['A', 'C']\n",
      "2     You are given a passage followed by a question...  ['A', 'B', 'C', 'D']\n",
      "3     You are given a passage followed by a question...       ['A', 'B', 'C']\n",
      "4     You are given a passage followed by a question...  ['A', 'B', 'C', 'E']\n",
      "...                                                 ...                   ...\n",
      "4635  You are given a passage followed by a question...       ['C', 'B', 'E']\n",
      "4636  You are given a passage followed by a question...       ['A', 'B', 'D']\n",
      "4637  You are given a passage followed by a question...  ['A', 'B', 'C', 'D']\n",
      "4638  You are given a passage followed by a question...            ['A', 'B']\n",
      "4639  You are given a passage followed by a question...  ['A', 'B', 'C', 'D']\n",
      "\n",
      "[4640 rows x 2 columns]\n",
      "-----------------------\n",
      "0       You are given a passage followed by a question...\n",
      "1       You are given a passage followed by a question...\n",
      "2       You are given a passage followed by a question...\n",
      "3       You are given a passage followed by a question...\n",
      "4       You are given a passage followed by a question...\n",
      "                              ...                        \n",
      "4635    You are given a passage followed by a question...\n",
      "4636    You are given a passage followed by a question...\n",
      "4637    You are given a passage followed by a question...\n",
      "4638    You are given a passage followed by a question...\n",
      "4639    You are given a passage followed by a question...\n",
      "Name: input, Length: 4640, dtype: object\n",
      "                                                 input                output\n",
      "0    You are given a passage followed by a question...            ['B', 'C']\n",
      "1    You are given a passage followed by a question...            ['C', 'D']\n",
      "2    You are given a passage followed by a question...                 ['D']\n",
      "3    You are given a passage followed by a question...  ['B', 'D', 'E', 'F']\n",
      "4    You are given a passage followed by a question...       ['D', 'E', 'F']\n",
      "..                                                 ...                   ...\n",
      "948  You are given a passage followed by a question...                 ['A']\n",
      "949  You are given a passage followed by a question...                 ['A']\n",
      "950  You are given a passage followed by a question...       ['A', 'B', 'C']\n",
      "951  You are given a passage followed by a question...       ['B', 'C', 'D']\n",
      "952  You are given a passage followed by a question...       ['A', 'B', 'D']\n",
      "\n",
      "[953 rows x 2 columns]\n",
      "-----------------------\n",
      "0      You are given a passage followed by a question...\n",
      "1      You are given a passage followed by a question...\n",
      "2      You are given a passage followed by a question...\n",
      "3      You are given a passage followed by a question...\n",
      "4      You are given a passage followed by a question...\n",
      "                             ...                        \n",
      "948    You are given a passage followed by a question...\n",
      "949    You are given a passage followed by a question...\n",
      "950    You are given a passage followed by a question...\n",
      "951    You are given a passage followed by a question...\n",
      "952    You are given a passage followed by a question...\n",
      "Name: input, Length: 953, dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Clean every raw CSV of the selected dataset and save the result next to the\n",
    "# source file as '<name>_clean.csv'.\n",
    "for folder, file in zip(flds, fles):\n",
    "    stem, ext = os.path.splitext(file)\n",
    "    # Only genuine '.csv' files; skip 'stest' files and outputs of earlier runs.\n",
    "    if ext != '.csv' or 'stest' in file or '_clean' in file:\n",
    "        continue\n",
    "    pth = os.path.join(folder, file)\n",
    "    csv = pd.read_csv(pth)\n",
    "    csv = csv.dropna()\n",
    "    print(csv)\n",
    "    print('-----------------------')\n",
    "    print(csv['input'])\n",
    "\n",
    "    if 'wic' in dataset:\n",
    "        csv = do_wic(csv)\n",
    "\n",
    "    if 'multirc' in dataset:\n",
    "        csv = do_multirc(csv)\n",
    "        # Only the instruction-prefixed files get their prefix rewritten.\n",
    "        if 'w_inst' in folder:\n",
    "            csv = change_instr_wic(csv)\n",
    "\n",
    "    # Build the name from the split stem so only the real extension is touched.\n",
    "    csv.to_csv(os.path.join(folder, stem + '_clean' + ext), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "spatial-cleveland",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "thorough-birmingham",
   "metadata": {},
   "outputs": [],
   "source": [
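    "# Scratch check (disabled): stems a sample prompt and a sample word, then tests\n",
    "# membership, mirroring the stem lookup used in do_wic.\n",
    "# st = 'Word: presence \\nSentence 1: The artist\\'s work'\n",
    "# pi = [ps.stem(token) for token in word_tokenize(st)]\n",
    "# pi2 = ps.stem(word_tokenize('artistic')[0])\n",
    "# print(pi,pi2)\n",
    "# print(pi2 in pi)"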
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "sacred-carpet",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
