{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Match/merge/split eeg-label-texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(22335, 6) Index(['eeg', 'mask', 'text', 'dataset', 'task', 'subject'], dtype='object')\n",
      "(1888, 17) Index(['raw text', 'dataset', 'task', 'control', 'raw label', 'input text',\n",
      "       'text uid', 'sentiment label', 'relation label',\n",
      "       'lexical simplification (v0)', 'lexical simplification (v1)',\n",
      "       'semantic clarity (v0)', 'semantic clarity (v1)',\n",
      "       'syntax simplification (v0)', 'syntax simplification (v1)',\n",
      "       'naive rewritten', 'naive simplified'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "EEG_PICKLE = './data/tmp/zuco_eeg_128ch_1280len.df'\n",
    "LABEL_PICKLE = './data/tmp/zuco_label_8variants.df'\n",
    "\n",
    "# EEG samples; the printed columns include `text`, `dataset`, `task` and `subject`.\n",
    "df = pd.read_pickle(EEG_PICKLE)\n",
    "print(df.shape, df.columns)\n",
    "\n",
    "# Label table holding the reference texts together with their label / variant columns.\n",
    "label_table = pd.read_pickle(LABEL_PICKLE)\n",
    "print(label_table.shape, label_table.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check text match\n",
    "- make sure all texts (after revision) obtained from the original `.mat` files can be retrieved from the label table (from the `.csv` files)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "# Substring -> replacement pairs used by `revise_typo` to normalise the EEG-side texts so\n",
    "# that they can be looked up in the `input text` column of the label table.\n",
    "# The pairs are applied in insertion order, so do not reorder them casually.\n",
    "# NOTE(review): several keys appear to contain mis-encoded characters left over from a lossy\n",
    "# decode of the source files; they must stay byte-for-byte as they are to keep matching.\n",
    "# NOTE(review): some pairs map towards a less correct spelling (e.g. the Bragança entry),\n",
    "# presumably because the label table stores that spelling -- confirm before 'fixing' them.\n",
    "typobook = {\"emp11111ty\":   \"empty\",\n",
    "            \"film.1\":       \"film.\",\n",
    "            \"–\":            \"-\",\n",
    "            \"’s\":           \"'s\",\n",
    "            \"�s\":           \"'s\",\n",
    "            \"`s\":           \"'s\",\n",
    "            \"Maria\":        \"Marić\",\n",
    "            \"1Universidad\": \"Universidad\",\n",
    "            \"1902—19\":      \"1902 - 19\",\n",
    "            \"Wuerttemberg\": \"Württemberg\",\n",
    "            \"long -time\":   \"long-time\",\n",
    "            \"Jose\":         \"José\",\n",
    "            \"Bucher\":       \"Bôcher\",\n",
    "            \"1839 ? May\":   \"1839 - May\",\n",
    "            \"G�n�ration\":  \"Generation\",\n",
    "            \"Bragança\":     \"Bragana\",\n",
    "            \"1837?October\": \"1837 - October\",\n",
    "            \"nVera-Ellen\":  \"Vera-Ellen\",\n",
    "            \"write Ethics\": \"wrote Ethics\",\n",
    "            \"Adams-Onis\":   \"Adams-Onís\",\n",
    "            \"(40 km?)\":     \"(40 km²)\",\n",
    "            \"(40 km˝)\":     \"(40 km²)\",\n",
    "            \" (IPA: /?g?nz?b?g/) \": \" \",\n",
    "            '\"\"Canes\"\"':    '\"Canes\"',\n",
    "\n",
    "            \"111Senator\":   \"Senator\",\n",
    "            \"Creteil\":      \"Créteil\",\n",
    "            \"Zoonomia\":     \"Zoönomia\",\n",
    "            \"1902�19\":     \"1902 - 19\",\n",
    "            \"nee Darwin\":   \"née Darwin\",\n",
    "            \"Ruthy\":        \"Réthy\",\n",
    "            \"Eidgenoessische\":  \"Eidgenössische\",\n",
    "            \"40 km�\":       \"40 km²\",\n",
    "            \"King Leopold\":  \"King Léopold\",\n",
    "            }\n",
    "\n",
    "def revise_typo(text):\n",
    "    \"\"\"Return `text` with every known typo from `typobook` replaced.\n",
    "\n",
    "    The (typo, correction) pairs are applied one after another in the order in\n",
    "    which they are stored in `typobook`; each pair replaces all occurrences of\n",
    "    the typo. A text containing none of the typos is returned unchanged.\n",
    "    \"\"\"\n",
    "    revised = text\n",
    "    for typo, correction in typobook.items():\n",
    "        if typo not in revised:\n",
    "            continue\n",
    "        revised = revised.replace(typo, correction)\n",
    "    return revised\n",
    "\n",
    "# def match_text(text_in_mat, raw_text_in_table, input_text_in_table):\n",
    "#     if (text_in_mat != raw_text_in_table) and (text_in_mat != input_text_in_table):\n",
    "#         text_in_mat_revised = revise_typo(text_in_mat)\n",
    "#         if (text_in_mat_revised != raw_text_in_table) and (text_in_mat_revised != input_text_in_table):\n",
    "#             return False\n",
    "#     return True\n",
    "\n",
    "# Normalise the EEG-side texts, then count how many of them are still absent from the\n",
    "# `input text` column of the label table (the stored output shows 0).\n",
    "df['revised text'] = [revise_typo(raw) for raw in df['text']]\n",
    "input_texts = label_table['input text'].tolist()\n",
    "matched = df['revised text'].isin(input_texts)\n",
    "unmatched_rows = df.loc[~matched]\n",
    "print(len(unmatched_rows))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Assign `label id` according to the `index` of matched row in the label table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['eeg', 'mask', 'text', 'dataset', 'task', 'subject', 'revised text',\n",
      "       'label id'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# Map every EEG sample to one row of `label_table` and store that row's index as `label id`.\n",
    "# Samples are processed per (dataset, task, subject). Inside a group the i-th sample is first\n",
    "# compared with the i-th row of the matching label sub-table; only when the texts differ is\n",
    "# the sub-table searched by text. One hit is taken as is, two hits are disambiguated per task,\n",
    "# anything else raises.\n",
    "new_groups = []\n",
    "for (d, t, s), group in df.groupby(['dataset', 'task', 'subject']):\n",
    "    sub_label_table = label_table[(label_table['dataset']==d) & (label_table['task']==t)]\n",
    "    label_ids = []\n",
    "    left_label = None  # `raw label` chosen for the previous sample of this group\n",
    "    # Texts already consumed for this subject (only consulted for task2). It must be created\n",
    "    # once per group: when it was re-created for every sample, the `src_text in task2_cache`\n",
    "    # test below could never succeed and the second label row of a repeated sentence was\n",
    "    # never assigned.\n",
    "    task2_cache = set()\n",
    "\n",
    "    for i in range(group.shape[0]):\n",
    "        row = group.iloc[i]\n",
    "        src_text = row['revised text']\n",
    "        ideal_matching_text = sub_label_table.iloc[i]['input text']\n",
    "        if src_text == ideal_matching_text:\n",
    "            # Fast path: sample i lines up with row i of the sub-table.\n",
    "            label_id = sub_label_table.iloc[i].name\n",
    "            control_label = sub_label_table.iloc[i]['raw label']\n",
    "        else:\n",
    "            matched_rows = sub_label_table[sub_label_table['input text'] == src_text]\n",
    "            assert matched_rows.shape[0] > 0\n",
    "            if matched_rows.shape[0] == 1:\n",
    "                label_id = matched_rows.index.values.item()\n",
    "                control_label = matched_rows['raw label'].values.item()\n",
    "            elif matched_rows.shape[0] == 2:\n",
    "                matched_labels = matched_rows['raw label'].values.tolist()\n",
    "                if t == 'task3':  # use context label to locate\n",
    "                    # assume the relation types are grouped as in the text table\n",
    "                    assert i != 0, f\"{i}\" # left label may not work for the first sample\n",
    "                    # The two candidates must not be adjacent rows of the label table.\n",
    "                    # NOTE(review): presumably adjacent rows would sit in the same relation\n",
    "                    # block, where the neighbour-label heuristic cannot separate them -- confirm.\n",
    "                    assert matched_rows.iloc[1].name - matched_rows.iloc[0].name > 1\n",
    "                    if left_label in matched_labels:\n",
    "                        # Previous sample carries one of the candidate labels: reuse it.\n",
    "                        the_matched_row = matched_rows[matched_rows['raw label']== left_label]\n",
    "                        control_label = left_label\n",
    "                    else: \n",
    "                        # Otherwise look at the next sample, or the one after it when the\n",
    "                        # next sample is itself ambiguous, and take its label.\n",
    "                        right_row = group.iloc[i+1]\n",
    "                        right_matched_rows = sub_label_table[sub_label_table['input text'] == right_row['revised text']]\n",
    "                        if right_matched_rows.shape[0] >1:\n",
    "                            right_row = group.iloc[i+2]\n",
    "                            right_matched_rows = sub_label_table[sub_label_table['input text'] == right_row['revised text']]\n",
    "                        right_label = right_matched_rows['raw label'].values.item()\n",
    "                        assert right_label in matched_labels\n",
    "                        the_matched_row = matched_rows[matched_rows['raw label']== right_label]\n",
    "                        control_label = right_label\n",
    "                    label_id = the_matched_row.index.values.item()\n",
    "                elif t == 'task2':\n",
    "                    # Same text on two table rows: the first time the subject reads it the\n",
    "                    # first row is used, a repetition gets the second row.\n",
    "                    if src_text in task2_cache:\n",
    "                        label_id = matched_rows.iloc[1].name\n",
    "                    else:\n",
    "                        label_id = matched_rows.iloc[0].name\n",
    "                    control_label = None\n",
    "                else:\n",
    "                    raise ValueError(f'{t}') \n",
    "            else: \n",
    "                raise ValueError(f'{matched_rows.shape[0]}')\n",
    "        if t == 'task2':\n",
    "            # Remember the text whichever path assigned it, so that a later repetition\n",
    "            # is recognised even if the first occurrence matched on the fast path.\n",
    "            task2_cache.add(src_text)\n",
    "        left_label = control_label\n",
    "        label_ids.append(label_id)\n",
    "\n",
    "    # `assign` returns a new frame instead of writing into the slice handed out by groupby.\n",
    "    new_groups.append(group.assign(**{'label id': label_ids}))\n",
    "df = pd.concat(new_groups)\n",
    "print(df.columns)   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Merge eeg and labels according to the `label id`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(22335, 21) Index(['eeg', 'mask', 'subject', 'label id', 'raw text', 'dataset', 'task',\n",
      "       'control', 'raw label', 'input text', 'text uid', 'sentiment label',\n",
      "       'relation label', 'lexical simplification (v0)',\n",
      "       'lexical simplification (v1)', 'semantic clarity (v0)',\n",
      "       'semantic clarity (v1)', 'syntax simplification (v0)',\n",
      "       'syntax simplification (v1)', 'naive rewritten', 'naive simplified'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# Keep only the EEG-side columns, then pull in every label-table column through `label id`,\n",
    "# which holds the index of the matching label-table row.\n",
    "kept_columns = ['eeg', 'mask', 'subject','label id']\n",
    "df = df.reindex(columns=kept_columns)\n",
    "df_merged = pd.merge(df, label_table, left_on='label id', right_index=True, how='left')\n",
    "print(df_merged.shape, df_merged.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Strict split on unique texts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As you can see in the current `label table`, there is more than one form of text overlap; these overlaps are intentional design choices of the ZuCo dataset (please refer to [Section 3.2](https://arxiv.org/pdf/1912.00903) of the ZuCo 2.0 paper). There are 4 overlapping conditions in addition to the inter-subject overlap:\n",
    "  1. between task2 and task3 (with same corpus of Wiki, offering comparison between reading paradigms, i.e., NR vs. TSR);\n",
    "  2. between zuco1 and zuco2 (to discuss the effect of experimental setting);\n",
    "  3. within task3 (same sentences annotated with different labels);\n",
    "  4. within zuco2-task2 (**unknown reason**).  \n",
    "\n",
    "\n",
    "Therefore, to ensure the generation will not benefit from the data leakage, we split samples by the `text uid`. \n",
    "- Due to the block design, adjacent samples have the same relation label in task3. We therefore adopt **random sampling** instead of *ordinal chunking* or *interval sampling* to: (1) increase the diversity of relation types; and (2) exclude potential bias caused by the 'temporal adaptability of stimuli'. We set a fixed random seed to ensure reproducibility.\n",
    "- We first collect all duplicated texts into the training set, then conduct stratified random sampling across `dataset` and `task`. \n",
    "- Samples with duplicate text but different relation labels are likewise placed in the training set, because they share a `text uid`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['eeg', 'mask', 'subject', 'label id', 'raw text', 'dataset', 'task',\n",
      "       'control', 'raw label', 'input text', 'text uid', 'sentiment label',\n",
      "       'relation label', 'lexical simplification (v0)',\n",
      "       'lexical simplification (v1)', 'semantic clarity (v0)',\n",
      "       'semantic clarity (v1)', 'syntax simplification (v0)',\n",
      "       'syntax simplification (v1)', 'naive rewritten', 'naive simplified',\n",
      "       'phase'],\n",
      "      dtype='object')\n",
      "phase\n",
      "train    17908\n",
      "test      2227\n",
      "val       2200\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Text uids that occur on more than one row of the label table, i.e. duplications that do\n",
    "# not come from several subjects reading the same sentence.\n",
    "uid_counts = label_table.value_counts(['text uid'])  # NOTE: find out subject-independent duplications\n",
    "duplicated_uids = uid_counts[uid_counts>1]\n",
    "duplicated_uids = [tp[0] for tp in duplicated_uids.index.tolist()]\n",
    "duplicated_uid_set = set(duplicated_uids)  # constant-time membership tests below\n",
    "\n",
    "# Split every (dataset, task) group by `text uid`: all duplicated uids go to train, the\n",
    "# remaining uids are shuffled with a fixed seed and cut into train / val / test.\n",
    "new_groups = []\n",
    "for name, group in df_merged.groupby(['dataset', 'task']):\n",
    "    text_uids = group['text uid'].values\n",
    "    # The element order of this list feeds the seeded permutation, so it is left untouched\n",
    "    # to keep the resulting split unchanged.\n",
    "    text_uid_set = list(set(text_uids))\n",
    "    dup_uids = [uid for uid in text_uid_set if uid in duplicated_uid_set]\n",
    "    uniq_uids = [uid for uid in text_uid_set if uid not in duplicated_uid_set]\n",
    "    n = len(text_uid_set)\n",
    "    a = len(uniq_uids)\n",
    "    # k is the share of unique uids sent to train, chosen so that train (including all the\n",
    "    # duplicated uids) : val : test = 8 : 1 : 1, i.e. k*a + (n-a) = 0.8*n.\n",
    "    # NOTE(review): a == 0 divides by zero and n > 5*a makes k negative; neither is guarded.\n",
    "    k = 0.8-0.2*(n-a)/a\n",
    "    rng = np.random.default_rng(seed=42)\n",
    "    uniq_uids_shuffled = rng.permutation(uniq_uids)\n",
    "    train, val, test = np.split(uniq_uids_shuffled, [int(k*a), int((a+k*a)/2)])\n",
    "    train_uids = train.tolist() + dup_uids\n",
    "    val_uids = val.tolist()\n",
    "    test_uids = test.tolist()\n",
    "\n",
    "    # One uid -> phase lookup instead of scanning three lists for every row. Later updates\n",
    "    # win, which keeps the original precedence train > val > test.\n",
    "    phase_of = {uid: 'test' for uid in test_uids}\n",
    "    phase_of.update({uid: 'val' for uid in val_uids})\n",
    "    phase_of.update({uid: 'train' for uid in train_uids})\n",
    "    # A uid missing from every split raises KeyError here instead of being skipped silently.\n",
    "    phases = [phase_of[uid] for uid in text_uids]\n",
    "\n",
    "    # `assign` returns a new frame instead of writing into the slice handed out by groupby.\n",
    "    new_groups.append(group.assign(phase=phases))\n",
    "df_merged = pd.concat(new_groups)\n",
    "print(df_merged.columns)\n",
    "print(df_merged.value_counts(['phase']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "eeg                            object\n",
      "mask                           object\n",
      "subject                        object\n",
      "label id                        int64\n",
      "raw text                       object\n",
      "dataset                        object\n",
      "task                           object\n",
      "control                          bool\n",
      "raw label                      object\n",
      "input text                     object\n",
      "text uid                        int64\n",
      "sentiment label                object\n",
      "relation label                 object\n",
      "lexical simplification (v0)    object\n",
      "lexical simplification (v1)    object\n",
      "semantic clarity (v0)          object\n",
      "semantic clarity (v1)          object\n",
      "syntax simplification (v0)     object\n",
      "syntax simplification (v1)     object\n",
      "naive rewritten                object\n",
      "naive simplified               object\n",
      "phase                          object\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Column dtypes of the merged frame, shown before it is written to disk.\n",
    "column_dtypes = df_merged.dtypes\n",
    "print(column_dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the merged frame, which by now includes the `phase` column.\n",
    "df_merged.to_pickle('./data/tmp/zuco_eeg_label_8variants.df')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "group: ('test', 'task1'), num of samples: 417, num of unique sentences: 40\n",
      "group: ('test', 'task2'), num of samples: 827, num of unique sentences: 65\n",
      "group: ('test', 'task3'), num of samples: 983, num of unique sentences: 77\n",
      "group: ('train', 'task1'), num of samples: 3316, num of unique sentences: 320\n",
      "group: ('train', 'task2'), num of samples: 6231, num of unique sentences: 461\n",
      "group: ('train', 'task3'), num of samples: 8361, num of unique sentences: 524\n",
      "group: ('val', 'task1'), num of samples: 406, num of unique sentences: 40\n",
      "group: ('val', 'task2'), num of samples: 824, num of unique sentences: 64\n",
      "group: ('val', 'task3'), num of samples: 970, num of unique sentences: 76\n"
     ]
    }
   ],
   "source": [
    "# Per (phase, task): number of EEG samples and number of distinct `text uid` values.\n",
    "for phase_task, rows in df_merged.groupby(['phase','task']):\n",
    "    n_samples = rows.shape[0]\n",
    "    n_sentences = rows['text uid'].nunique()\n",
    "    print(f\"group: {phase_task}, num of samples: {n_samples}, num of unique sentences: {n_sentences}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "group: ('test', 'YAC'), num of samples: 44, num of unique sentences: 44\n",
      "group: ('test', 'YAG'), num of samples: 56, num of unique sentences: 56\n",
      "group: ('test', 'YAK'), num of samples: 62, num of unique sentences: 62\n",
      "group: ('test', 'YDG'), num of samples: 73, num of unique sentences: 73\n",
      "group: ('test', 'YDR'), num of samples: 72, num of unique sentences: 72\n",
      "group: ('test', 'YFR'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('test', 'YFS'), num of samples: 71, num of unique sentences: 71\n",
      "group: ('test', 'YHS'), num of samples: 73, num of unique sentences: 73\n",
      "group: ('test', 'YIS'), num of samples: 72, num of unique sentences: 72\n",
      "group: ('test', 'YLS'), num of samples: 63, num of unique sentences: 63\n",
      "group: ('test', 'YMD'), num of samples: 63, num of unique sentences: 63\n",
      "group: ('test', 'YMS'), num of samples: 65, num of unique sentences: 65\n",
      "group: ('test', 'YRH'), num of samples: 55, num of unique sentences: 55\n",
      "group: ('test', 'YRK'), num of samples: 51, num of unique sentences: 51\n",
      "group: ('test', 'YRP'), num of samples: 51, num of unique sentences: 51\n",
      "group: ('test', 'YSD'), num of samples: 72, num of unique sentences: 72\n",
      "group: ('test', 'YSL'), num of samples: 65, num of unique sentences: 65\n",
      "group: ('test', 'YTL'), num of samples: 66, num of unique sentences: 66\n",
      "group: ('test', 'ZAB'), num of samples: 106, num of unique sentences: 106\n",
      "group: ('test', 'ZDM'), num of samples: 103, num of unique sentences: 103\n",
      "group: ('test', 'ZDN'), num of samples: 99, num of unique sentences: 99\n",
      "group: ('test', 'ZGW'), num of samples: 93, num of unique sentences: 93\n",
      "group: ('test', 'ZJM'), num of samples: 85, num of unique sentences: 85\n",
      "group: ('test', 'ZJN'), num of samples: 72, num of unique sentences: 72\n",
      "group: ('test', 'ZJS'), num of samples: 92, num of unique sentences: 92\n",
      "group: ('test', 'ZKB'), num of samples: 92, num of unique sentences: 92\n",
      "group: ('test', 'ZKH'), num of samples: 97, num of unique sentences: 97\n",
      "group: ('test', 'ZKW'), num of samples: 85, num of unique sentences: 85\n",
      "group: ('test', 'ZMG'), num of samples: 105, num of unique sentences: 105\n",
      "group: ('test', 'ZPH'), num of samples: 90, num of unique sentences: 90\n",
      "group: ('train', 'YAC'), num of samples: 399, num of unique sentences: 367\n",
      "group: ('train', 'YAG'), num of samples: 442, num of unique sentences: 383\n",
      "group: ('train', 'YAK'), num of samples: 479, num of unique sentences: 425\n",
      "group: ('train', 'YDG'), num of samples: 568, num of unique sentences: 492\n",
      "group: ('train', 'YDR'), num of samples: 572, num of unique sentences: 497\n",
      "group: ('train', 'YFR'), num of samples: 327, num of unique sentences: 302\n",
      "group: ('train', 'YFS'), num of samples: 564, num of unique sentences: 488\n",
      "group: ('train', 'YHS'), num of samples: 592, num of unique sentences: 513\n",
      "group: ('train', 'YIS'), num of samples: 574, num of unique sentences: 498\n",
      "group: ('train', 'YLS'), num of samples: 524, num of unique sentences: 454\n",
      "group: ('train', 'YMD'), num of samples: 478, num of unique sentences: 414\n",
      "group: ('train', 'YMS'), num of samples: 515, num of unique sentences: 444\n",
      "group: ('train', 'YRH'), num of samples: 462, num of unique sentences: 417\n",
      "group: ('train', 'YRK'), num of samples: 458, num of unique sentences: 397\n",
      "group: ('train', 'YRP'), num of samples: 438, num of unique sentences: 382\n",
      "group: ('train', 'YSD'), num of samples: 580, num of unique sentences: 503\n",
      "group: ('train', 'YSL'), num of samples: 511, num of unique sentences: 439\n",
      "group: ('train', 'YTL'), num of samples: 528, num of unique sentences: 457\n",
      "group: ('train', 'ZAB'), num of samples: 833, num of unique sentences: 772\n",
      "group: ('train', 'ZDM'), num of samples: 838, num of unique sentences: 774\n",
      "group: ('train', 'ZDN'), num of samples: 792, num of unique sentences: 721\n",
      "group: ('train', 'ZGW'), num of samples: 677, num of unique sentences: 634\n",
      "group: ('train', 'ZJM'), num of samples: 710, num of unique sentences: 663\n",
      "group: ('train', 'ZJN'), num of samples: 612, num of unique sentences: 581\n",
      "group: ('train', 'ZJS'), num of samples: 716, num of unique sentences: 669\n",
      "group: ('train', 'ZKB'), num of samples: 758, num of unique sentences: 709\n",
      "group: ('train', 'ZKH'), num of samples: 775, num of unique sentences: 721\n",
      "group: ('train', 'ZKW'), num of samples: 664, num of unique sentences: 625\n",
      "group: ('train', 'ZMG'), num of samples: 817, num of unique sentences: 756\n",
      "group: ('train', 'ZPH'), num of samples: 705, num of unique sentences: 659\n",
      "group: ('val', 'YAC'), num of samples: 40, num of unique sentences: 40\n",
      "group: ('val', 'YAG'), num of samples: 59, num of unique sentences: 59\n",
      "group: ('val', 'YAK'), num of samples: 60, num of unique sentences: 60\n",
      "group: ('val', 'YDG'), num of samples: 70, num of unique sentences: 70\n",
      "group: ('val', 'YDR'), num of samples: 69, num of unique sentences: 69\n",
      "group: ('val', 'YFR'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'YFS'), num of samples: 70, num of unique sentences: 70\n",
      "group: ('val', 'YHS'), num of samples: 70, num of unique sentences: 70\n",
      "group: ('val', 'YIS'), num of samples: 69, num of unique sentences: 69\n",
      "group: ('val', 'YLS'), num of samples: 68, num of unique sentences: 68\n",
      "group: ('val', 'YMD'), num of samples: 58, num of unique sentences: 58\n",
      "group: ('val', 'YMS'), num of samples: 65, num of unique sentences: 65\n",
      "group: ('val', 'YRH'), num of samples: 59, num of unique sentences: 59\n",
      "group: ('val', 'YRK'), num of samples: 55, num of unique sentences: 55\n",
      "group: ('val', 'YRP'), num of samples: 55, num of unique sentences: 55\n",
      "group: ('val', 'YSD'), num of samples: 71, num of unique sentences: 71\n",
      "group: ('val', 'YSL'), num of samples: 66, num of unique sentences: 66\n",
      "group: ('val', 'YTL'), num of samples: 65, num of unique sentences: 65\n",
      "group: ('val', 'ZAB'), num of samples: 103, num of unique sentences: 103\n",
      "group: ('val', 'ZDM'), num of samples: 99, num of unique sentences: 99\n",
      "group: ('val', 'ZDN'), num of samples: 101, num of unique sentences: 101\n",
      "group: ('val', 'ZGW'), num of samples: 84, num of unique sentences: 84\n",
      "group: ('val', 'ZJM'), num of samples: 80, num of unique sentences: 80\n",
      "group: ('val', 'ZJN'), num of samples: 75, num of unique sentences: 75\n",
      "group: ('val', 'ZJS'), num of samples: 98, num of unique sentences: 98\n",
      "group: ('val', 'ZKB'), num of samples: 89, num of unique sentences: 89\n",
      "group: ('val', 'ZKH'), num of samples: 95, num of unique sentences: 95\n",
      "group: ('val', 'ZKW'), num of samples: 81, num of unique sentences: 81\n",
      "group: ('val', 'ZMG'), num of samples: 100, num of unique sentences: 100\n",
      "group: ('val', 'ZPH'), num of samples: 92, num of unique sentences: 92\n"
     ]
    }
   ],
   "source": [
    "# Per (phase, subject): number of EEG samples and number of distinct `text uid` values.\n",
    "for phase_subject, rows in df_merged.groupby(['phase','subject']):\n",
    "    n_samples = rows.shape[0]\n",
    "    n_sentences = rows['text uid'].nunique()\n",
    "    print(f\"group: {phase_subject}, num of samples: {n_samples}, num of unique sentences: {n_sentences}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "group: ('test', 'YAC', 'task2'), num of samples: 21, num of unique sentences: 21\n",
      "group: ('test', 'YAC', 'task3'), num of samples: 23, num of unique sentences: 23\n",
      "group: ('test', 'YAG', 'task2'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('test', 'YAG', 'task3'), num of samples: 27, num of unique sentences: 27\n",
      "group: ('test', 'YAK', 'task2'), num of samples: 27, num of unique sentences: 27\n",
      "group: ('test', 'YAK', 'task3'), num of samples: 35, num of unique sentences: 35\n",
      "group: ('test', 'YDG', 'task2'), num of samples: 35, num of unique sentences: 35\n",
      "group: ('test', 'YDG', 'task3'), num of samples: 38, num of unique sentences: 38\n",
      "group: ('test', 'YDR', 'task2'), num of samples: 35, num of unique sentences: 35\n",
      "group: ('test', 'YDR', 'task3'), num of samples: 37, num of unique sentences: 37\n",
      "group: ('test', 'YFR', 'task2'), num of samples: 22, num of unique sentences: 22\n",
      "group: ('test', 'YFR', 'task3'), num of samples: 12, num of unique sentences: 12\n",
      "group: ('test', 'YFS', 'task2'), num of samples: 33, num of unique sentences: 33\n",
      "group: ('test', 'YFS', 'task3'), num of samples: 38, num of unique sentences: 38\n",
      "group: ('test', 'YHS', 'task2'), num of samples: 35, num of unique sentences: 35\n",
      "group: ('test', 'YHS', 'task3'), num of samples: 38, num of unique sentences: 38\n",
      "group: ('test', 'YIS', 'task2'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('test', 'YIS', 'task3'), num of samples: 38, num of unique sentences: 38\n",
      "group: ('test', 'YLS', 'task2'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('test', 'YLS', 'task3'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('test', 'YMD', 'task2'), num of samples: 30, num of unique sentences: 30\n",
      "group: ('test', 'YMD', 'task3'), num of samples: 33, num of unique sentences: 33\n",
      "group: ('test', 'YMS', 'task2'), num of samples: 30, num of unique sentences: 30\n",
      "group: ('test', 'YMS', 'task3'), num of samples: 35, num of unique sentences: 35\n",
      "group: ('test', 'YRH', 'task2'), num of samples: 26, num of unique sentences: 26\n",
      "group: ('test', 'YRH', 'task3'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('test', 'YRK', 'task2'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('test', 'YRK', 'task3'), num of samples: 22, num of unique sentences: 22\n",
      "group: ('test', 'YRP', 'task2'), num of samples: 28, num of unique sentences: 28\n",
      "group: ('test', 'YRP', 'task3'), num of samples: 23, num of unique sentences: 23\n",
      "group: ('test', 'YSD', 'task2'), num of samples: 35, num of unique sentences: 35\n",
      "group: ('test', 'YSD', 'task3'), num of samples: 37, num of unique sentences: 37\n",
      "group: ('test', 'YSL', 'task2'), num of samples: 32, num of unique sentences: 32\n",
      "group: ('test', 'YSL', 'task3'), num of samples: 33, num of unique sentences: 33\n",
      "group: ('test', 'YTL', 'task2'), num of samples: 28, num of unique sentences: 28\n",
      "group: ('test', 'YTL', 'task3'), num of samples: 38, num of unique sentences: 38\n",
      "group: ('test', 'ZAB', 'task1'), num of samples: 39, num of unique sentences: 39\n",
      "group: ('test', 'ZAB', 'task2'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('test', 'ZAB', 'task3'), num of samples: 38, num of unique sentences: 38\n",
      "group: ('test', 'ZDM', 'task1'), num of samples: 37, num of unique sentences: 37\n",
      "group: ('test', 'ZDM', 'task2'), num of samples: 27, num of unique sentences: 27\n",
      "group: ('test', 'ZDM', 'task3'), num of samples: 39, num of unique sentences: 39\n",
      "group: ('test', 'ZDN', 'task1'), num of samples: 31, num of unique sentences: 31\n",
      "group: ('test', 'ZDN', 'task2'), num of samples: 30, num of unique sentences: 30\n",
      "group: ('test', 'ZDN', 'task3'), num of samples: 38, num of unique sentences: 38\n",
      "group: ('test', 'ZGW', 'task1'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('test', 'ZGW', 'task2'), num of samples: 23, num of unique sentences: 23\n",
      "group: ('test', 'ZGW', 'task3'), num of samples: 36, num of unique sentences: 36\n",
      "group: ('test', 'ZJM', 'task1'), num of samples: 36, num of unique sentences: 36\n",
      "group: ('test', 'ZJM', 'task2'), num of samples: 17, num of unique sentences: 17\n",
      "group: ('test', 'ZJM', 'task3'), num of samples: 32, num of unique sentences: 32\n",
      "group: ('test', 'ZJN', 'task1'), num of samples: 25, num of unique sentences: 25\n",
      "group: ('test', 'ZJN', 'task2'), num of samples: 16, num of unique sentences: 16\n",
      "group: ('test', 'ZJN', 'task3'), num of samples: 31, num of unique sentences: 31\n",
      "group: ('test', 'ZJS', 'task1'), num of samples: 30, num of unique sentences: 30\n",
      "group: ('test', 'ZJS', 'task2'), num of samples: 27, num of unique sentences: 27\n",
      "group: ('test', 'ZJS', 'task3'), num of samples: 35, num of unique sentences: 35\n",
      "group: ('test', 'ZKB', 'task1'), num of samples: 37, num of unique sentences: 37\n",
      "group: ('test', 'ZKB', 'task2'), num of samples: 23, num of unique sentences: 23\n",
      "group: ('test', 'ZKB', 'task3'), num of samples: 32, num of unique sentences: 32\n",
      "group: ('test', 'ZKH', 'task1'), num of samples: 38, num of unique sentences: 38\n",
      "group: ('test', 'ZKH', 'task2'), num of samples: 28, num of unique sentences: 28\n",
      "group: ('test', 'ZKH', 'task3'), num of samples: 31, num of unique sentences: 31\n",
      "group: ('test', 'ZKW', 'task1'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('test', 'ZKW', 'task2'), num of samples: 16, num of unique sentences: 16\n",
      "group: ('test', 'ZKW', 'task3'), num of samples: 35, num of unique sentences: 35\n",
      "group: ('test', 'ZMG', 'task1'), num of samples: 37, num of unique sentences: 37\n",
      "group: ('test', 'ZMG', 'task2'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('test', 'ZMG', 'task3'), num of samples: 39, num of unique sentences: 39\n",
      "group: ('test', 'ZPH', 'task1'), num of samples: 39, num of unique sentences: 39\n",
      "group: ('test', 'ZPH', 'task2'), num of samples: 19, num of unique sentences: 19\n",
      "group: ('test', 'ZPH', 'task3'), num of samples: 32, num of unique sentences: 32\n",
      "group: ('train', 'YAC', 'task2'), num of samples: 193, num of unique sentences: 188\n",
      "group: ('train', 'YAC', 'task3'), num of samples: 206, num of unique sentences: 206\n",
      "group: ('train', 'YAG', 'task2'), num of samples: 216, num of unique sentences: 213\n",
      "group: ('train', 'YAG', 'task3'), num of samples: 226, num of unique sentences: 215\n",
      "group: ('train', 'YAK', 'task2'), num of samples: 191, num of unique sentences: 188\n",
      "group: ('train', 'YAK', 'task3'), num of samples: 288, num of unique sentences: 274\n",
      "group: ('train', 'YDG', 'task2'), num of samples: 264, num of unique sentences: 258\n",
      "group: ('train', 'YDG', 'task3'), num of samples: 304, num of unique sentences: 289\n",
      "group: ('train', 'YDR', 'task2'), num of samples: 258, num of unique sentences: 253\n",
      "group: ('train', 'YDR', 'task3'), num of samples: 314, num of unique sentences: 299\n",
      "group: ('train', 'YFR', 'task2'), num of samples: 155, num of unique sentences: 153\n",
      "group: ('train', 'YFR', 'task3'), num of samples: 172, num of unique sentences: 171\n",
      "group: ('train', 'YFS', 'task2'), num of samples: 257, num of unique sentences: 251\n",
      "group: ('train', 'YFS', 'task3'), num of samples: 307, num of unique sentences: 292\n",
      "group: ('train', 'YHS', 'task2'), num of samples: 277, num of unique sentences: 271\n",
      "group: ('train', 'YHS', 'task3'), num of samples: 315, num of unique sentences: 300\n",
      "group: ('train', 'YIS', 'task2'), num of samples: 259, num of unique sentences: 253\n",
      "group: ('train', 'YIS', 'task3'), num of samples: 315, num of unique sentences: 300\n",
      "group: ('train', 'YLS', 'task2'), num of samples: 248, num of unique sentences: 243\n",
      "group: ('train', 'YLS', 'task3'), num of samples: 276, num of unique sentences: 263\n",
      "group: ('train', 'YMD', 'task2'), num of samples: 211, num of unique sentences: 207\n",
      "group: ('train', 'YMD', 'task3'), num of samples: 267, num of unique sentences: 253\n",
      "group: ('train', 'YMS', 'task2'), num of samples: 210, num of unique sentences: 205\n",
      "group: ('train', 'YMS', 'task3'), num of samples: 305, num of unique sentences: 290\n",
      "group: ('train', 'YRH', 'task2'), num of samples: 221, num of unique sentences: 217\n",
      "group: ('train', 'YRH', 'task3'), num of samples: 241, num of unique sentences: 239\n",
      "group: ('train', 'YRK', 'task2'), num of samples: 221, num of unique sentences: 216\n",
      "group: ('train', 'YRK', 'task3'), num of samples: 237, num of unique sentences: 228\n",
      "group: ('train', 'YRP', 'task2'), num of samples: 222, num of unique sentences: 217\n",
      "group: ('train', 'YRP', 'task3'), num of samples: 216, num of unique sentences: 211\n",
      "group: ('train', 'YSD', 'task2'), num of samples: 265, num of unique sentences: 259\n",
      "group: ('train', 'YSD', 'task3'), num of samples: 315, num of unique sentences: 300\n",
      "group: ('train', 'YSL', 'task2'), num of samples: 241, num of unique sentences: 235\n",
      "group: ('train', 'YSL', 'task3'), num of samples: 270, num of unique sentences: 255\n",
      "group: ('train', 'YTL', 'task2'), num of samples: 217, num of unique sentences: 212\n",
      "group: ('train', 'YTL', 'task3'), num of samples: 311, num of unique sentences: 296\n",
      "group: ('train', 'ZAB', 'task1'), num of samples: 307, num of unique sentences: 307\n",
      "group: ('train', 'ZAB', 'task2'), num of samples: 220, num of unique sentences: 220\n",
      "group: ('train', 'ZAB', 'task3'), num of samples: 306, num of unique sentences: 288\n",
      "group: ('train', 'ZDM', 'task1'), num of samples: 302, num of unique sentences: 302\n",
      "group: ('train', 'ZDM', 'task2'), num of samples: 220, num of unique sentences: 220\n",
      "group: ('train', 'ZDM', 'task3'), num of samples: 316, num of unique sentences: 297\n",
      "group: ('train', 'ZDN', 'task1'), num of samples: 230, num of unique sentences: 230\n",
      "group: ('train', 'ZDN', 'task2'), num of samples: 235, num of unique sentences: 235\n",
      "group: ('train', 'ZDN', 'task3'), num of samples: 327, num of unique sentences: 306\n",
      "group: ('train', 'ZGW', 'task1'), num of samples: 245, num of unique sentences: 245\n",
      "group: ('train', 'ZGW', 'task2'), num of samples: 164, num of unique sentences: 164\n",
      "group: ('train', 'ZGW', 'task3'), num of samples: 268, num of unique sentences: 261\n",
      "group: ('train', 'ZJM', 'task1'), num of samples: 286, num of unique sentences: 286\n",
      "group: ('train', 'ZJM', 'task2'), num of samples: 141, num of unique sentences: 141\n",
      "group: ('train', 'ZJM', 'task3'), num of samples: 283, num of unique sentences: 267\n",
      "group: ('train', 'ZJN', 'task1'), num of samples: 215, num of unique sentences: 215\n",
      "group: ('train', 'ZJN', 'task2'), num of samples: 130, num of unique sentences: 130\n",
      "group: ('train', 'ZJN', 'task3'), num of samples: 267, num of unique sentences: 259\n",
      "group: ('train', 'ZJS', 'task1'), num of samples: 272, num of unique sentences: 272\n",
      "group: ('train', 'ZJS', 'task2'), num of samples: 157, num of unique sentences: 157\n",
      "group: ('train', 'ZJS', 'task3'), num of samples: 287, num of unique sentences: 271\n",
      "group: ('train', 'ZKB', 'task1'), num of samples: 296, num of unique sentences: 296\n",
      "group: ('train', 'ZKB', 'task2'), num of samples: 165, num of unique sentences: 165\n",
      "group: ('train', 'ZKB', 'task3'), num of samples: 297, num of unique sentences: 281\n",
      "group: ('train', 'ZKH', 'task1'), num of samples: 301, num of unique sentences: 301\n",
      "group: ('train', 'ZKH', 'task2'), num of samples: 195, num of unique sentences: 195\n",
      "group: ('train', 'ZKH', 'task3'), num of samples: 279, num of unique sentences: 262\n",
      "group: ('train', 'ZKW', 'task1'), num of samples: 260, num of unique sentences: 260\n",
      "group: ('train', 'ZKW', 'task2'), num of samples: 116, num of unique sentences: 116\n",
      "group: ('train', 'ZKW', 'task3'), num of samples: 288, num of unique sentences: 271\n",
      "group: ('train', 'ZMG', 'task1'), num of samples: 297, num of unique sentences: 297\n",
      "group: ('train', 'ZMG', 'task2'), num of samples: 206, num of unique sentences: 206\n",
      "group: ('train', 'ZMG', 'task3'), num of samples: 314, num of unique sentences: 295\n",
      "group: ('train', 'ZPH', 'task1'), num of samples: 305, num of unique sentences: 305\n",
      "group: ('train', 'ZPH', 'task2'), num of samples: 156, num of unique sentences: 156\n",
      "group: ('train', 'ZPH', 'task3'), num of samples: 244, num of unique sentences: 226\n",
      "group: ('val', 'YAC', 'task2'), num of samples: 18, num of unique sentences: 18\n",
      "group: ('val', 'YAC', 'task3'), num of samples: 22, num of unique sentences: 22\n",
      "group: ('val', 'YAG', 'task2'), num of samples: 28, num of unique sentences: 28\n",
      "group: ('val', 'YAG', 'task3'), num of samples: 31, num of unique sentences: 31\n",
      "group: ('val', 'YAK', 'task2'), num of samples: 27, num of unique sentences: 27\n",
      "group: ('val', 'YAK', 'task3'), num of samples: 33, num of unique sentences: 33\n",
      "group: ('val', 'YDG', 'task2'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'YDG', 'task3'), num of samples: 36, num of unique sentences: 36\n",
      "group: ('val', 'YDR', 'task2'), num of samples: 33, num of unique sentences: 33\n",
      "group: ('val', 'YDR', 'task3'), num of samples: 36, num of unique sentences: 36\n",
      "group: ('val', 'YFR', 'task2'), num of samples: 21, num of unique sentences: 21\n",
      "group: ('val', 'YFR', 'task3'), num of samples: 13, num of unique sentences: 13\n",
      "group: ('val', 'YFS', 'task2'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'YFS', 'task3'), num of samples: 36, num of unique sentences: 36\n",
      "group: ('val', 'YHS', 'task2'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'YHS', 'task3'), num of samples: 36, num of unique sentences: 36\n",
      "group: ('val', 'YIS', 'task2'), num of samples: 33, num of unique sentences: 33\n",
      "group: ('val', 'YIS', 'task3'), num of samples: 36, num of unique sentences: 36\n",
      "group: ('val', 'YLS', 'task2'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'YLS', 'task3'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'YMD', 'task2'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('val', 'YMD', 'task3'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('val', 'YMS', 'task2'), num of samples: 31, num of unique sentences: 31\n",
      "group: ('val', 'YMS', 'task3'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'YRH', 'task2'), num of samples: 28, num of unique sentences: 28\n",
      "group: ('val', 'YRH', 'task3'), num of samples: 31, num of unique sentences: 31\n",
      "group: ('val', 'YRK', 'task2'), num of samples: 26, num of unique sentences: 26\n",
      "group: ('val', 'YRK', 'task3'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('val', 'YRP', 'task2'), num of samples: 30, num of unique sentences: 30\n",
      "group: ('val', 'YRP', 'task3'), num of samples: 25, num of unique sentences: 25\n",
      "group: ('val', 'YSD', 'task2'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'YSD', 'task3'), num of samples: 37, num of unique sentences: 37\n",
      "group: ('val', 'YSL', 'task2'), num of samples: 32, num of unique sentences: 32\n",
      "group: ('val', 'YSL', 'task3'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'YTL', 'task2'), num of samples: 28, num of unique sentences: 28\n",
      "group: ('val', 'YTL', 'task3'), num of samples: 37, num of unique sentences: 37\n",
      "group: ('val', 'ZAB', 'task1'), num of samples: 38, num of unique sentences: 38\n",
      "group: ('val', 'ZAB', 'task2'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('val', 'ZAB', 'task3'), num of samples: 36, num of unique sentences: 36\n",
      "group: ('val', 'ZDM', 'task1'), num of samples: 35, num of unique sentences: 35\n",
      "group: ('val', 'ZDM', 'task2'), num of samples: 28, num of unique sentences: 28\n",
      "group: ('val', 'ZDM', 'task3'), num of samples: 36, num of unique sentences: 36\n",
      "group: ('val', 'ZDN', 'task1'), num of samples: 32, num of unique sentences: 32\n",
      "group: ('val', 'ZDN', 'task2'), num of samples: 30, num of unique sentences: 30\n",
      "group: ('val', 'ZDN', 'task3'), num of samples: 39, num of unique sentences: 39\n",
      "group: ('val', 'ZGW', 'task1'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('val', 'ZGW', 'task2'), num of samples: 23, num of unique sentences: 23\n",
      "group: ('val', 'ZGW', 'task3'), num of samples: 32, num of unique sentences: 32\n",
      "group: ('val', 'ZJM', 'task1'), num of samples: 31, num of unique sentences: 31\n",
      "group: ('val', 'ZJM', 'task2'), num of samples: 18, num of unique sentences: 18\n",
      "group: ('val', 'ZJM', 'task3'), num of samples: 31, num of unique sentences: 31\n",
      "group: ('val', 'ZJN', 'task1'), num of samples: 27, num of unique sentences: 27\n",
      "group: ('val', 'ZJN', 'task2'), num of samples: 20, num of unique sentences: 20\n",
      "group: ('val', 'ZJN', 'task3'), num of samples: 28, num of unique sentences: 28\n",
      "group: ('val', 'ZJS', 'task1'), num of samples: 35, num of unique sentences: 35\n",
      "group: ('val', 'ZJS', 'task2'), num of samples: 27, num of unique sentences: 27\n",
      "group: ('val', 'ZJS', 'task3'), num of samples: 36, num of unique sentences: 36\n",
      "group: ('val', 'ZKB', 'task1'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'ZKB', 'task2'), num of samples: 26, num of unique sentences: 26\n",
      "group: ('val', 'ZKB', 'task3'), num of samples: 29, num of unique sentences: 29\n",
      "group: ('val', 'ZKH', 'task1'), num of samples: 37, num of unique sentences: 37\n",
      "group: ('val', 'ZKH', 'task2'), num of samples: 24, num of unique sentences: 24\n",
      "group: ('val', 'ZKH', 'task3'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'ZKW', 'task1'), num of samples: 31, num of unique sentences: 31\n",
      "group: ('val', 'ZKW', 'task2'), num of samples: 18, num of unique sentences: 18\n",
      "group: ('val', 'ZKW', 'task3'), num of samples: 32, num of unique sentences: 32\n",
      "group: ('val', 'ZMG', 'task1'), num of samples: 39, num of unique sentences: 39\n",
      "group: ('val', 'ZMG', 'task2'), num of samples: 27, num of unique sentences: 27\n",
      "group: ('val', 'ZMG', 'task3'), num of samples: 34, num of unique sentences: 34\n",
      "group: ('val', 'ZPH', 'task1'), num of samples: 38, num of unique sentences: 38\n",
      "group: ('val', 'ZPH', 'task2'), num of samples: 20, num of unique sentences: 20\n",
      "group: ('val', 'ZPH', 'task3'), num of samples: 34, num of unique sentences: 34\n"
     ]
    }
   ],
   "source": [
    "for name, group in df_merged.groupby(['phase','subject', 'task']):\n",
    "    print(f\"group: {name}, num of samples: {group.shape[0]}, num of unique sentences: {group['text uid'].nunique()}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "glim",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
