{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'data/binary_classification/train.csv'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m/vista/devel/personal/whroe/development/projects/vista/git/simpletransformers/examples/t5/mt5/data_prep.ipynb Cell 2'\u001b[0m in \u001b[0;36m<cell line: 3>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      <a href='vscode-notebook-cell:/vista/devel/personal/whroe/development/projects/vista/git/simpletransformers/examples/t5/mt5/data_prep.ipynb#ch0000001?line=0'>1</a>\u001b[0m prefix \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mdata/binary_classification/\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> <a href='vscode-notebook-cell:/vista/devel/personal/whroe/development/projects/vista/git/simpletransformers/examples/t5/mt5/data_prep.ipynb#ch0000001?line=2'>3</a>\u001b[0m binary_train_df \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(prefix \u001b[39m+\u001b[39;49m \u001b[39m'\u001b[39;49m\u001b[39mtrain.csv\u001b[39;49m\u001b[39m'\u001b[39;49m, header\u001b[39m=\u001b[39;49m\u001b[39mNone\u001b[39;49;00m)\n\u001b[1;32m      <a href='vscode-notebook-cell:/vista/devel/personal/whroe/development/projects/vista/git/simpletransformers/examples/t5/mt5/data_prep.ipynb#ch0000001?line=3'>4</a>\u001b[0m binary_train_df\u001b[39m.\u001b[39mhead()\n\u001b[1;32m      <a href='vscode-notebook-cell:/vista/devel/personal/whroe/development/projects/vista/git/simpletransformers/examples/t5/mt5/data_prep.ipynb#ch0000001?line=5'>6</a>\u001b[0m binary_eval_df \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mread_csv(prefix \u001b[39m+\u001b[39m \u001b[39m'\u001b[39m\u001b[39mtest.csv\u001b[39m\u001b[39m'\u001b[39m, header\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m)\n",
      "File \u001b[0;32m/devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/util/_decorators.py:311\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/util/_decorators.py?line=304'>305</a>\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(args) \u001b[39m>\u001b[39m num_allow_args:\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/util/_decorators.py?line=305'>306</a>\u001b[0m     warnings\u001b[39m.\u001b[39mwarn(\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/util/_decorators.py?line=306'>307</a>\u001b[0m         msg\u001b[39m.\u001b[39mformat(arguments\u001b[39m=\u001b[39marguments),\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/util/_decorators.py?line=307'>308</a>\u001b[0m         \u001b[39mFutureWarning\u001b[39;00m,\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/util/_decorators.py?line=308'>309</a>\u001b[0m         stacklevel\u001b[39m=\u001b[39mstacklevel,\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/util/_decorators.py?line=309'>310</a>\u001b[0m     )\n\u001b[0;32m--> <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/util/_decorators.py?line=310'>311</a>\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
      "File \u001b[0;32m/devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py:680\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=664'>665</a>\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=665'>666</a>\u001b[0m     dialect,\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=666'>667</a>\u001b[0m     delimiter,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=675'>676</a>\u001b[0m     defaults\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mdelimiter\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=676'>677</a>\u001b[0m )\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=677'>678</a>\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=679'>680</a>\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
      "File \u001b[0;32m/devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py:575\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=571'>572</a>\u001b[0m _validate_names(kwds\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mnames\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=573'>574</a>\u001b[0m \u001b[39m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=574'>575</a>\u001b[0m parser \u001b[39m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwds)\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=576'>577</a>\u001b[0m \u001b[39mif\u001b[39;00m chunksize \u001b[39mor\u001b[39;00m iterator:\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=577'>578</a>\u001b[0m     \u001b[39mreturn\u001b[39;00m parser\n",
      "File \u001b[0;32m/devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py:933\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=929'>930</a>\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptions[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m kwds[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=931'>932</a>\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles: IOHandles \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m--> <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=932'>933</a>\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_engine \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_engine(f, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mengine)\n",
      "File \u001b[0;32m/devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1217\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1212'>1213</a>\u001b[0m     mode \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1213'>1214</a>\u001b[0m \u001b[39m# error: No overload variant of \"get_handle\" matches argument types\u001b[39;00m\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1214'>1215</a>\u001b[0m \u001b[39m# \"Union[str, PathLike[str], ReadCsvBuffer[bytes], ReadCsvBuffer[str]]\"\u001b[39;00m\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1215'>1216</a>\u001b[0m \u001b[39m# , \"str\", \"bool\", \"Any\", \"Any\", \"Any\", \"Any\", \"Any\"\u001b[39;00m\n\u001b[0;32m-> <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1216'>1217</a>\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39m=\u001b[39m get_handle(  \u001b[39m# type: ignore[call-overload]\u001b[39;49;00m\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1217'>1218</a>\u001b[0m     f,\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1218'>1219</a>\u001b[0m     mode,\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1219'>1220</a>\u001b[0m     encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1220'>1221</a>\u001b[0m     compression\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mcompression\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1221'>1222</a>\u001b[0m     memory_map\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mmemory_map\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mFalse\u001b[39;49;00m),\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1222'>1223</a>\u001b[0m     is_text\u001b[39m=\u001b[39;49mis_text,\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1223'>1224</a>\u001b[0m     errors\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding_errors\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mstrict\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1224'>1225</a>\u001b[0m     storage_options\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mstorage_options\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1225'>1226</a>\u001b[0m )\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1226'>1227</a>\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m   <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/parsers/readers.py?line=1227'>1228</a>\u001b[0m f \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles\u001b[39m.\u001b[39mhandle\n",
      "File \u001b[0;32m/devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py:789\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=783'>784</a>\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(handle, \u001b[39mstr\u001b[39m):\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=784'>785</a>\u001b[0m     \u001b[39m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=785'>786</a>\u001b[0m     \u001b[39m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=786'>787</a>\u001b[0m     \u001b[39mif\u001b[39;00m ioargs\u001b[39m.\u001b[39mencoding \u001b[39mand\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m ioargs\u001b[39m.\u001b[39mmode:\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=787'>788</a>\u001b[0m         \u001b[39m# Encoding\u001b[39;00m\n\u001b[0;32m--> <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=788'>789</a>\u001b[0m         handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39;49m(\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=789'>790</a>\u001b[0m             handle,\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=790'>791</a>\u001b[0m             ioargs\u001b[39m.\u001b[39;49mmode,\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=791'>792</a>\u001b[0m             encoding\u001b[39m=\u001b[39;49mioargs\u001b[39m.\u001b[39;49mencoding,\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=792'>793</a>\u001b[0m             errors\u001b[39m=\u001b[39;49merrors,\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=793'>794</a>\u001b[0m             newline\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=794'>795</a>\u001b[0m         )\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=795'>796</a>\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=796'>797</a>\u001b[0m         \u001b[39m# Binary mode\u001b[39;00m\n\u001b[1;32m    <a href='file:///devel/personal/whroe/envs/st/lib/python3.9/site-packages/pandas/io/common.py?line=797'>798</a>\u001b[0m         handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(handle, ioargs\u001b[39m.\u001b[39mmode)\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/binary_classification/train.csv'"
     ]
    }
   ],
   "source": [
    "prefix = 'data/binary_classification/'\n",
    "\n",
    "binary_train_df = pd.read_csv(prefix + 'train.csv', header=None)\n",
    "binary_train_df.head()\n",
    "\n",
    "binary_eval_df = pd.read_csv(prefix + 'test.csv', header=None)\n",
    "binary_eval_df.head()\n",
    "\n",
    "binary_train_df[0] = (binary_train_df[0] == 2).astype(int)\n",
    "binary_eval_df[0] = (binary_eval_df[0] == 2).astype(int)\n",
    "\n",
    "binary_train_df = pd.DataFrame({\n",
    "    'prefix': [\"binary classification\" for i in range(len(binary_train_df))],\n",
    "    'input_text': binary_train_df[1].str.replace('\\n', ' '),\n",
    "    'target_text': binary_train_df[0].astype(str),\n",
    "})\n",
    "\n",
    "print(binary_train_df.head())\n",
    "\n",
    "binary_eval_df = pd.DataFrame({\n",
    "    'prefix': [\"binary classification\" for i in range(len(binary_eval_df))],\n",
    "    'input_text': binary_eval_df[1].str.replace('\\n', ' '),\n",
    "    'target_text': binary_eval_df[0].astype(str),\n",
    "})\n",
    "\n",
    "\n",
    "print(binary_eval_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>prefix</th>\n",
       "      <th>input_text</th>\n",
       "      <th>target_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>140162</th>\n",
       "      <td>multilabel classification</td>\n",
       "      <td>ban   you got me banned on irc -</td>\n",
       "      <td>clean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>135151</th>\n",
       "      <td>multilabel classification</td>\n",
       "      <td>This is a public computer   Hi, I have a sligh...</td>\n",
       "      <td>clean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4901</th>\n",
       "      <td>multilabel classification</td>\n",
       "      <td>Why does nobody post anything on 'my talk' tha...</td>\n",
       "      <td>clean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58298</th>\n",
       "      <td>multilabel classification</td>\n",
       "      <td>Okay sorry I didn't read the article for a while.</td>\n",
       "      <td>clean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56472</th>\n",
       "      <td>multilabel classification</td>\n",
       "      <td>If you really feel that strongly about protect...</td>\n",
       "      <td>clean</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           prefix  \\\n",
       "140162  multilabel classification   \n",
       "135151  multilabel classification   \n",
       "4901    multilabel classification   \n",
       "58298   multilabel classification   \n",
       "56472   multilabel classification   \n",
       "\n",
       "                                               input_text target_text  \n",
       "140162                   ban   you got me banned on irc -       clean  \n",
       "135151  This is a public computer   Hi, I have a sligh...       clean  \n",
       "4901    Why does nobody post anything on 'my talk' tha...       clean  \n",
       "58298   Okay sorry I didn't read the article for a while.       clean  \n",
       "56472   If you really feel that strongly about protect...       clean  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prefix = \"data/multilabel_classification/\"\n",
    "\n",
    "multi_train_df = pd.read_csv(prefix + 'train.csv')\n",
    "multi_train_df[\"comment_text\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
    "\n",
    "for col in multi_train_df.columns:\n",
    "    if col not in [\"id\", \"comment_text\"]:\n",
    "        multi_train_df[col] = multi_train_df[col].apply(lambda x: col if x else \"\")\n",
    "\n",
    "multi_train_df[\"target_text\"] = multi_train_df['toxic'].str.cat(multi_train_df[[col for col in multi_train_df.columns if col not in [\"id\", \"comment_text\", \"toxic\"]]], sep=',')\n",
    "multi_train_df[\"target_text\"] = multi_train_df[\"target_text\"].apply(lambda x: \",\".join(word for word in x.split(\",\") if word)).apply(lambda x: x if x else \"clean\")\n",
    "multi_train_df[\"input_text\"] = multi_train_df[\"comment_text\"].str.replace('\\n', ' ')\n",
    "multi_train_df[\"prefix\"] = \"multilabel classification\"\n",
    "multi_train_df = multi_train_df[[\"prefix\", \"input_text\", \"target_text\"]]\n",
    "\n",
    "multi_train_df, multi_eval_df = train_test_split(multi_train_df, test_size=0.1)\n",
    "\n",
    "multi_train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "b'Skipping line 2509: expected 10 fields, saw 11\\nSkipping line 2650: expected 10 fields, saw 11\\nSkipping line 2727: expected 10 fields, saw 11\\nSkipping line 3071: expected 10 fields, saw 11\\nSkipping line 3393: expected 10 fields, saw 11\\n'\n",
      "b'Skipping line 1042: expected 10 fields, saw 11\\nSkipping line 1066: expected 10 fields, saw 11\\nSkipping line 1083: expected 10 fields, saw 11\\nSkipping line 1137: expected 10 fields, saw 11\\nSkipping line 1150: expected 10 fields, saw 11\\n'\n"
     ]
    }
   ],
   "source": [
    "prefix = 'data/regression/'\n",
    "\n",
    "sts_train_df = pd.read_csv(prefix + 'train.tsv', sep='\\t', error_bad_lines=False).dropna()\n",
    "sts_eval_df = pd.read_csv(prefix + 'dev.tsv', sep='\\t', error_bad_lines=False).dropna()\n",
    "\n",
    "sts_train_df[\"sentence1\"] = sts_train_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
    "sts_train_df[\"sentence2\"] = sts_train_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
    "sts_eval_df[\"sentence1\"] = sts_eval_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
    "sts_eval_df[\"sentence2\"] = sts_eval_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sts_train_df.drop(2001, inplace=True) # This line badly formatted. Getting rid."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sts_train_df[\"input_text\"] = sts_train_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n",
    "sts_eval_df[\"input_text\"] = sts_eval_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n",
    "\n",
    "sts_train_df[\"target_text\"] = sts_train_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
    "sts_eval_df[\"target_text\"] = sts_eval_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
    "\n",
    "sts_train_df[\"prefix\"] = \"similarity\"\n",
    "sts_eval_df[\"prefix\"] = \"similarity\"\n",
    "\n",
    "sts_train_df = sts_train_df[[\"prefix\", \"input_text\", \"target_text\"]]\n",
    "sts_eval_df = sts_eval_df[[\"prefix\", \"input_text\", \"target_text\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.concat([binary_train_df, multi_train_df, sts_train_df]).astype(str)\n",
    "eval_df = pd.concat([binary_eval_df, multi_eval_df, sts_eval_df]).astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.to_csv(\"data/train.tsv\", \"\\t\")\n",
    "eval_df.to_csv(\"data/eval.tsv\", \"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "f5e825c258dabef721a4c4eee8f1ed05466e2655bf4b920138ac685c780c71b8"
  },
  "kernelspec": {
   "display_name": "Python 3.9.12",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
