{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract stimulus texts and original labels\n",
    "Set up `data_dir` and run the following blocks one-by-one to:\n",
    "1. extract texts and labels from `.csv` files;\n",
    "2. revise some known typos.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "data_dir = './data/raw_data'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ZuCo1-task1 (Normal Reading)\n",
    "- Copy all `...ZuCo1/task_materials/xxx.csv` files into a new created dir (e.g., `.../ZuCo1/revised_csv/`).\n",
    "- There are several format errors in some original `.csv` files (e.g., absence of column headers), you may take a few minutes to *manually correct* them according to the ERROR messages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(400, 5) Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')\n",
      "400\n"
     ]
    }
   ],
   "source": [
    "df11_raw = pd.read_csv(data_dir + '/ZuCo1/revised_csv/sentiment_labels_task1.csv', \n",
    "                       sep=';', header=0,  skiprows=[1], encoding='utf-8',\n",
    "                       dtype={'sentence': str, 'control': str, 'sentiment_label':str})\n",
    "# print(df1_raw)\n",
    "# n_row, n_column = df11_raw.shape\n",
    "df11 = df11_raw.rename(columns={'sentence': 'raw text', \n",
    "                            'sentiment_label': 'raw label'})\n",
    "df11 = df11.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])\n",
    "                      \n",
    "df11['dataset'] =  ['ZuCo1'] * df11.shape[0]  # each item is init as a tuple with len==1 for easy extension\n",
    "df11['task'] =  ['task1'] * df11.shape[0]\n",
    "df11['control'] = df11['control'].apply(lambda x: x == 'CONTROL')\n",
    "print(df11.shape, df11.columns)\n",
    "print(df11['raw text'].nunique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ZuCo1-task2 (Normal Reading)  \n",
    "Note: there are multiple relation labels in some of the sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(300, 5) Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')\n",
      "300\n"
     ]
    }
   ],
   "source": [
    "def reformat_relation_types(text):\n",
    "    '''\n",
    "    `VISITED` --> tuple(`VISITED`,)\n",
    "    `AWARD;JOB_TITLE;NATIONALITY` --> tuple(`AWARD`,`JOB_TITLE`,`NATIONALITY`)\n",
    "    `NO-RELATION` --> np.nan\n",
    "    '''\n",
    "    assert isinstance(text, str)\n",
    "    if text == 'NO-RELATION':\n",
    "        text = np.nan\n",
    "    else:\n",
    "        text = tuple(text.split(';'))\n",
    "    return text\n",
    "\n",
    "df12_raw = pd.read_csv(data_dir + '/ZuCo1/revised_csv/relations_labels_task2.csv', \n",
    "                       sep=',', header=0, encoding='utf-8',\n",
    "                       dtype={'sentence': str,'control': str,'relation_types':str})\n",
    "# n_row, n_column = df12_raw.shape\n",
    "df12 = df12_raw.rename(columns={'sentence': 'raw text', \n",
    "                                'relation_types': 'raw label'})\n",
    "df12 = df12.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])\n",
    "df12['dataset'] =  ['ZuCo1'] * df12.shape[0]\n",
    "df12['task'] =  ['task2'] * df12.shape[0]\n",
    "df12['control'] = df12['control'].apply(lambda x: x == 'CONTROL')\n",
    "df12['raw label'] = df12['raw label'].apply(reformat_relation_types)\n",
    "print(df12.shape, df12.columns)\n",
    "print(df12['raw text'].nunique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ZuCo1-task3 (Task-specific Reading)  \n",
    "Note: there are repeated sentences yet with different relation labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(407, 5) Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')\n",
      "386\n"
     ]
    }
   ],
   "source": [
    "def assign_control_with_label(label):\n",
    "    assert label in ['AWARD', 'EDUCATION', 'EMPLOYER', \n",
    "                   'FOUNDER', 'JOB_TITLE', 'NATIONALITY', \n",
    "                   'POLITICAL_AFFILIATION', 'VISITED', 'WIFE',\n",
    "                   'CONTROL']\n",
    "    return True if label == 'CONTROL' else False\n",
    "\n",
    "df13_raw = pd.read_csv(data_dir + '/ZuCo1/revised_csv/relations_labels_task3.csv', \n",
    "                       sep=';', header=0, encoding='utf-8', \n",
    "                       dtype={'sentence': str, 'relation-type':str})\n",
    "df13 = df13_raw.rename(columns={'sentence': 'raw text', \n",
    "                            'relation-type': 'raw label'})\n",
    "df13 = df13.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])\n",
    "df13['dataset'] =  ['ZuCo1'] * df13.shape[0]\n",
    "df13['task'] =  ['task3'] * df13.shape[0]\n",
    "df13['control'] = df13['raw label'].apply(assign_control_with_label)\n",
    "# df13['label'] = df13['label'].apply(lambda x: x if x!='CONTROL' else np.nan)\n",
    "for i in range(df13.shape[0]):\n",
    "    label = df13.loc[i, 'raw label']\n",
    "    if label == 'CONTROL':\n",
    "        left = df13.loc[i-1, 'raw label']\n",
    "        right = df13.loc[i+1, 'raw label']\n",
    "        assert left == right\n",
    "        df13.loc[i, 'raw label'] = left\n",
    "\n",
    "print(df13.shape, df13.columns)\n",
    "print(df13['raw text'].nunique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ZuCo2-task2 (Normal Reading)  \n",
    "Note: there repeated sentences with unkown labels, we will drop them at the next step"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "370 Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')\n",
      "365\n"
     ]
    }
   ],
   "source": [
    "def extract_merge(file_dir, n=1):\n",
    "    sentence_path = file_dir + f'/nr_{n}.csv'\n",
    "    control_path = file_dir + f'/nr_{n}_control_questions.csv'\n",
    "    df_raw = pd.read_csv(sentence_path, sep=';', encoding='utf-8', header=None,\n",
    "                         names = ['paragraph_id', 'sentence_id','sentence','control'],\n",
    "                         dtype={'paragraph_id':str, 'sentence_id': str, 'sentence': str, 'control': str})\n",
    "    df_control = pd.read_csv(control_path, sep=';', encoding='utf-8', header=0,\n",
    "                             dtype={'paragraph_id':str, 'sentence_id': str,'control_question': str, 'correct_answer':str})\n",
    "    assert df_raw[df_raw['control']=='CONTROL'].shape[0] == df_control.shape[0]\n",
    "    df = pd.merge(df_raw, df_control, how='left', on=['paragraph_id', 'sentence_id'])\n",
    "    return df\n",
    "\n",
    "def merge_QA(q,a):\n",
    "    if pd.isna(q):\n",
    "        label = np.nan\n",
    "    else:\n",
    "        if q.endswith('...'):\n",
    "            label = q.replace('...', ' '+a)\n",
    "        elif q.endswith('?'):\n",
    "            label = q + ' ' + a\n",
    "        else:\n",
    "            raise ValueError\n",
    "    return label\n",
    "\n",
    "file_dir = data_dir + '/ZuCo2/task_materials'\n",
    "df22_list = []\n",
    "for i in range(1,8):\n",
    "    df = extract_merge(file_dir, i)\n",
    "    df22_list.append(df)\n",
    "df22 = pd.concat(df22_list, ignore_index=True,)\n",
    "\n",
    "labels=[]\n",
    "for i in range(df22.shape[0]):\n",
    "    label = merge_QA(df22['control_question'][i], df22['correct_answer'][i])\n",
    "    labels.append(label)\n",
    "df22['raw label'] = labels\n",
    "df22['control'] = df22['control'].apply(lambda x: x == 'CONTROL')\n",
    "\n",
    "df22 = df22.rename(columns={'sentence': 'raw text'})\n",
    "df22 = df22.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])\n",
    "df22['dataset'] =  ['ZuCo2'] * df22.shape[0]\n",
    "df22['task'] =  ['task2'] * df22.shape[0]\n",
    "print(df22.shape[0], df22.columns)\n",
    "print(df22['raw text'].nunique())\n",
    "# print(df22['raw text'].value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ZuCo2-task3 (Task-specific Reading)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "411 Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')\n",
      "392\n"
     ]
    }
   ],
   "source": [
    "def extract_task3(file_dir, n=1):\n",
    "    file_path = file_dir + f'/tsr_{n}.csv'\n",
    "    df_raw = pd.read_csv(file_path, sep=';', encoding='utf-8', header=None,\n",
    "                         names = ['paragraph_id', 'sentence_id', 'sentence', 'label'],\n",
    "                         dtype={'paragraph_id':str, 'sentence_id': str, 'sentence': str, 'label': str})\n",
    "    df = df_raw.rename(columns={'sentence': 'raw text', \n",
    "                                'label': 'raw label'})\n",
    "    df = df.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])\n",
    "    df['control'] = df['raw label'].apply(assign_control_with_label)\n",
    "    unique_labels = df['raw label'].unique().tolist()\n",
    "    unique_labels.remove('CONTROL')\n",
    "    assert len(unique_labels) == 1\n",
    "    df['raw label'] =  unique_labels * df.shape[0]\n",
    "    df['dataset'] =  ['ZuCo2'] * df.shape[0]\n",
    "    df['task'] =  ['task3'] * df.shape[0]\n",
    "    return df\n",
    "\n",
    "def assign_control_with_label(label):\n",
    "    assert label in ['AWARD', 'EDUCATION', 'EMPLOYER', \n",
    "                   'FOUNDER', 'JOB_TITLE', 'NATIONALITY', \n",
    "                   'POLITICAL_AFFILIATION', 'VISITED', 'WIFE',\n",
    "                   'CONTROL']\n",
    "    return True if label == 'CONTROL' else False\n",
    "\n",
    "file_dir = data_dir + '/ZuCo2/task_materials'\n",
    "df23_list = []\n",
    "for i in range(1,8):\n",
    "    df = extract_task3(file_dir,i)\n",
    "    df23_list.append(df)\n",
    "df23 = pd.concat(df23_list, ignore_index=True,)\n",
    "print(df23.shape[0], df23.columns)\n",
    "print(df23['raw text'].nunique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Concat sub-tables and revise typos "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1888, 5) Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "df = pd.concat([df11, df12, df13, df22, df23], ignore_index=True,)\n",
    "print(df.shape, df.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Revise each `raw text` according the typos we identified."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['raw text', 'dataset', 'task', 'control', 'raw label', 'input text'], dtype='object')\n",
      "1578 1556\n"
     ]
    }
   ],
   "source": [
    "typobook = {\"emp11111ty\":   \"empty\",\n",
    "            \"film.1\":       \"film.\",\n",
    "            \"–\":            \"-\",\n",
    "            \"’s\":           \"'s\",\n",
    "            \"�s\":           \"'s\",\n",
    "            \"`s\":           \"'s\",\n",
    "            \"Maria\":        \"Marić\",\n",
    "            \"1Universidad\": \"Universidad\",\n",
    "            \"1902—19\":      \"1902 - 19\",\n",
    "            \"Wuerttemberg\": \"Württemberg\",\n",
    "            \"long -time\":   \"long-time\",\n",
    "            \"Jose\":         \"José\",\n",
    "            \"Bucher\":       \"Bôcher\",\n",
    "            \"1839 ? May\":   \"1839 - May\",\n",
    "            \"G�n�ration\":  \"Generation\",\n",
    "            \"Bragança\":     \"Bragana\",\n",
    "            \"1837?October\": \"1837 - October\",\n",
    "            \"nVera-Ellen\":  \"Vera-Ellen\",\n",
    "            \"write Ethics\": \"wrote Ethics\",\n",
    "            \"Adams-Onis\":   \"Adams-Onís\",\n",
    "            \"(40 km?)\":     \"(40 km²)\",\n",
    "            \"(40 km˝)\":     \"(40 km²)\",\n",
    "            \" (IPA: /?g?nz?b?g/) \": \" \",\n",
    "            '\"\"Canes\"\"':    '\"Canes\"',\n",
    "\n",
    "            }\n",
    "\n",
    "def revise_typo(text):\n",
    "    # the typo book \n",
    "    book = typobook\n",
    "    for src, tgt in book.items():\n",
    "        if src in text:\n",
    "            text = text.replace(src, tgt)\n",
    "    return text\n",
    "\n",
    "df['input text'] = df['raw text'].apply(revise_typo)\n",
    "print(df.columns)\n",
    "print(df['raw text'].nunique(), df['input text'].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.to_pickle(df, './data/tmp/zuco_label_input_text.df')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "glim-clone",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
