{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "prefix                                         input_text  \\\n",
      "0  binary classification  Unfortunately, the frustration of being Dr. Go...   \n",
      "1  binary classification  Been going to Dr. Goldberg for over 10 years. ...   \n",
      "2  binary classification  I don't know what Dr. Goldberg was like before...   \n",
      "3  binary classification  I'm writing this review to give you a heads up...   \n",
      "4  binary classification  All the food is great here. But the best thing...   \n",
      "\n",
      "  target_text  \n",
      "0           0  \n",
      "1           1  \n",
      "2           0  \n",
      "3           0  \n",
      "4           1  \n",
      "                  prefix                                         input_text  \\\n",
      "0  binary classification  Contrary to other reviews, I have zero complai...   \n",
      "1  binary classification  Last summer I had an appointment to get new ti...   \n",
      "2  binary classification  Friendly staff, same starbucks fair you get an...   \n",
      "3  binary classification  The food is good. Unfortunately the service is...   \n",
      "4  binary classification  Even when we didn't have a car Filene's Baseme...   \n",
      "\n",
      "  target_text  \n",
      "0           1  \n",
      "1           0  \n",
      "2           1  \n",
      "3           0  \n",
      "4           1  \n"
     ]
    }
   ],
   "source": [
    "prefix = 'data/binary_classification/'\n",
    "\n",
    "binary_train_df = pd.read_csv(prefix + 'train.csv', header=None)\n",
    "binary_train_df.head()\n",
    "\n",
    "binary_eval_df = pd.read_csv(prefix + 'test.csv', header=None)\n",
    "binary_eval_df.head()\n",
    "\n",
    "binary_train_df[0] = (binary_train_df[0] == 2).astype(int)\n",
    "binary_eval_df[0] = (binary_eval_df[0] == 2).astype(int)\n",
    "\n",
    "binary_train_df = pd.DataFrame({\n",
    "    'prefix': [\"binary classification\" for i in range(len(binary_train_df))],\n",
    "    'input_text': binary_train_df[1].str.replace('\\n', ' '),\n",
    "    'target_text': binary_train_df[0].astype(str),\n",
    "})\n",
    "\n",
    "print(binary_train_df.head())\n",
    "\n",
    "binary_eval_df = pd.DataFrame({\n",
    "    'prefix': [\"binary classification\" for i in range(len(binary_eval_df))],\n",
    "    'input_text': binary_eval_df[1].str.replace('\\n', ' '),\n",
    "    'target_text': binary_eval_df[0].astype(str),\n",
    "})\n",
    "\n",
    "\n",
    "print(binary_eval_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>prefix</th>\n",
       "      <th>input_text</th>\n",
       "      <th>target_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>140162</th>\n",
       "      <td>multilabel classification</td>\n",
       "      <td>ban   you got me banned on irc -</td>\n",
       "      <td>clean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>135151</th>\n",
       "      <td>multilabel classification</td>\n",
       "      <td>This is a public computer   Hi, I have a sligh...</td>\n",
       "      <td>clean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4901</th>\n",
       "      <td>multilabel classification</td>\n",
       "      <td>Why does nobody post anything on 'my talk' tha...</td>\n",
       "      <td>clean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58298</th>\n",
       "      <td>multilabel classification</td>\n",
       "      <td>Okay sorry I didn't read the article for a while.</td>\n",
       "      <td>clean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56472</th>\n",
       "      <td>multilabel classification</td>\n",
       "      <td>If you really feel that strongly about protect...</td>\n",
       "      <td>clean</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           prefix  \\\n",
       "140162  multilabel classification   \n",
       "135151  multilabel classification   \n",
       "4901    multilabel classification   \n",
       "58298   multilabel classification   \n",
       "56472   multilabel classification   \n",
       "\n",
       "                                               input_text target_text  \n",
       "140162                   ban   you got me banned on irc -       clean  \n",
       "135151  This is a public computer   Hi, I have a sligh...       clean  \n",
       "4901    Why does nobody post anything on 'my talk' tha...       clean  \n",
       "58298   Okay sorry I didn't read the article for a while.       clean  \n",
       "56472   If you really feel that strongly about protect...       clean  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prefix = \"data/multilabel_classification/\"\n",
    "\n",
    "multi_train_df = pd.read_csv(prefix + 'train.csv')\n",
    "multi_train_df[\"comment_text\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
    "\n",
    "for col in multi_train_df.columns:\n",
    "    if col not in [\"id\", \"comment_text\"]:\n",
    "        multi_train_df[col] = multi_train_df[col].apply(lambda x: col if x else \"\")\n",
    "\n",
    "multi_train_df[\"target_text\"] = multi_train_df['toxic'].str.cat(multi_train_df[[col for col in multi_train_df.columns if col not in [\"id\", \"comment_text\", \"toxic\"]]], sep=',')\n",
    "multi_train_df[\"target_text\"] = multi_train_df[\"target_text\"].apply(lambda x: \",\".join(word for word in x.split(\",\") if word)).apply(lambda x: x if x else \"clean\")\n",
    "multi_train_df[\"input_text\"] = multi_train_df[\"comment_text\"].str.replace('\\n', ' ')\n",
    "multi_train_df[\"prefix\"] = \"multilabel classification\"\n",
    "multi_train_df = multi_train_df[[\"prefix\", \"input_text\", \"target_text\"]]\n",
    "\n",
    "multi_train_df, multi_eval_df = train_test_split(multi_train_df, test_size=0.1)\n",
    "\n",
    "multi_train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "b'Skipping line 2509: expected 10 fields, saw 11\\nSkipping line 2650: expected 10 fields, saw 11\\nSkipping line 2727: expected 10 fields, saw 11\\nSkipping line 3071: expected 10 fields, saw 11\\nSkipping line 3393: expected 10 fields, saw 11\\n'\n",
      "b'Skipping line 1042: expected 10 fields, saw 11\\nSkipping line 1066: expected 10 fields, saw 11\\nSkipping line 1083: expected 10 fields, saw 11\\nSkipping line 1137: expected 10 fields, saw 11\\nSkipping line 1150: expected 10 fields, saw 11\\n'\n"
     ]
    }
   ],
   "source": [
    "prefix = 'data/regression/'\n",
    "\n",
    "sts_train_df = pd.read_csv(prefix + 'train.tsv', sep='\\t', error_bad_lines=False).dropna()\n",
    "sts_eval_df = pd.read_csv(prefix + 'dev.tsv', sep='\\t', error_bad_lines=False).dropna()\n",
    "\n",
    "sts_train_df[\"sentence1\"] = sts_train_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
    "sts_train_df[\"sentence2\"] = sts_train_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
    "sts_eval_df[\"sentence1\"] = sts_eval_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
    "sts_eval_df[\"sentence2\"] = sts_eval_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "sts_train_df.drop(2001, inplace=True) # This line badly formatted. Getting rid."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "sts_train_df[\"input_text\"] = sts_train_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n",
    "sts_eval_df[\"input_text\"] = sts_eval_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n",
    "\n",
    "sts_train_df[\"target_text\"] = sts_train_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
    "sts_eval_df[\"target_text\"] = sts_eval_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
    "\n",
    "sts_train_df[\"prefix\"] = \"similarity\"\n",
    "sts_eval_df[\"prefix\"] = \"similarity\"\n",
    "\n",
    "sts_train_df = sts_train_df[[\"prefix\", \"input_text\", \"target_text\"]]\n",
    "sts_eval_df = sts_eval_df[[\"prefix\", \"input_text\", \"target_text\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.concat([binary_train_df, multi_train_df, sts_train_df]).astype(str)\n",
    "eval_df = pd.concat([binary_eval_df, multi_eval_df, sts_eval_df]).astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.to_csv(\"data/train.tsv\", \"\\t\")\n",
    "eval_df.to_csv(\"data/eval.tsv\", \"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "f5e825c258dabef721a4c4eee8f1ed05466e2655bf4b920138ac685c780c71b8"
  },
  "kernelspec": {
   "display_name": "Python 3.9.12",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
