{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Scrambling Sentences in Different GLUE tasks\n",
    "This notebook helps you scramble the sentences of different tasks by swapping their vocabulary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "from vocab_mismatch_utils import *\n",
    "from data_formatter_utils import *\n",
    "from datasets import DatasetDict\n",
    "from datasets import Dataset\n",
    "from datasets import load_dataset\n",
    "from datasets import list_datasets\n",
    "import transformers\n",
    "import pandas as pd\n",
    "import operator\n",
    "from collections import OrderedDict\n",
    "from tqdm import tqdm, trange\n",
    "\n",
    "import collections\n",
    "import os\n",
    "import unicodedata\n",
    "from typing import List, Optional, Tuple\n",
    "\n",
    "from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace\n",
    "from transformers.utils import logging\n",
    "import torch\n",
    "logger = logging.get_logger(__name__)\n",
    "import numpy as np\n",
    "import copy\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "lemmatizer = WordNetLemmatizer() \n",
    "from word_forms.word_forms import get_word_forms\n",
    "from functools import partial\n",
    "import matplotlib.ticker as mticker\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rcParams[\"font.family\"] = \"DejaVu Serif\"\n",
    "font = {'family' : 'DejaVu Serif',\n",
    "        'size'   : 20}\n",
    "plt.rc('font', **font)\n",
    "\n",
    "# Task name -> directory name that is joined under `external_output_dirname`\n",
    "# when the task's train/dev/test TSV files are read.\n",
    "FILENAME_CONFIG = {\n",
    "    \"sst3\" : \"sst-tenary\",\n",
    "    \"cola\" : \"cola\",\n",
    "    \"mnli\" : \"mnli\",\n",
    "    \"snli\" : \"snli\",\n",
    "    \"mrpc\" : \"mrpc\",\n",
    "    \"qnli\" : \"qnli\",\n",
    "    \"conll2003\" : \"conll2003\",\n",
    "    \"en_ewt\" : \"en_ewt\"\n",
    "}\n",
    "# Task name -> (first input field, second input field) of an example;\n",
    "# the second entry is None for tasks with a single input.\n",
    "TASK_CONFIG = {\n",
    "    \"wiki-text\": (\"text\", None),\n",
    "    \"sst3\": (\"text\", None),\n",
    "    \"cola\": (\"sentence\", None),\n",
    "    \"mnli\": (\"premise\", \"hypothesis\"),\n",
    "    \"snli\": (\"premise\", \"hypothesis\"),\n",
    "    \"mrpc\": (\"sentence1\", \"sentence2\"),\n",
    "    \"qnli\": (\"question\", \"sentence\"),\n",
    "    \"conll2003\" : (\"tokens\", None),\n",
    "    \"en_ewt\" : (\"tokens\", None)\n",
    "}\n",
    "\n",
    "# Directory passed as `cache_dir` to `load_dataset`.\n",
    "cache_dir = \"../tmp/\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Inline Helper Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This tokenizer gives the piece length of each token: `tokenize` returns\n",
    "# (tokens, token_dict), where token_dict maps a token to its pieces.\n",
    "modified_tokenizer = ModifiedBertTokenizer(\n",
    "    vocab_file=\"../data-files/bert_vocab.txt\")\n",
    "# NOTE(review): not referenced by the other cells visible here -- presumably used further down.\n",
    "modified_basic_tokenizer = ModifiedBasicTokenizer()\n",
    "\n",
    "def token_stats_mapping(task, example):\n",
    "    # for tasks that have single sentence\n",
    "    if task == \"sst3\" or task == \"wiki-text\" or task == \"cola\":\n",
    "        original_sentence = example[TASK_CONFIG[task][0]]\n",
    "        if original_sentence != None and original_sentence.strip() != \"\" and original_sentence.strip() != \"None\":\n",
    "            if len(original_sentence.strip()) != 0:\n",
    "                tokens, token_dict = modified_tokenizer.tokenize(original_sentence)\n",
    "                for token, pieces in token_dict.items():\n",
    "                    if token in token_frequency_map.keys():\n",
    "                        token_frequency_map[token] = token_frequency_map[token] + 1\n",
    "                    else:\n",
    "                        token_frequency_map[token] = 1\n",
    "                    _len = len(pieces)\n",
    "                    if _len in token_by_length.keys():\n",
    "                        if token not in token_by_length[_len]:\n",
    "                            token_by_length[_len].append(token)\n",
    "                    else:\n",
    "                        token_by_length[_len] = [token]\n",
    "    # for tasks that have two sentences\n",
    "    elif task == \"mrpc\" or task == \"mnli\" or task == \"snli\" or task == \"qnli\":\n",
    "        original_sentence = example[TASK_CONFIG[task][0]]\n",
    "        if original_sentence != None and original_sentence.strip() != \"\" and original_sentence.strip() != \"None\":\n",
    "            tokens, token_dict = modified_tokenizer.tokenize(original_sentence)\n",
    "            for token, pieces in token_dict.items():\n",
    "                if token in token_frequency_map.keys():\n",
    "                    token_frequency_map[token] = token_frequency_map[token] + 1\n",
    "                else:\n",
    "                    token_frequency_map[token] = 1\n",
    "                _len = len(pieces)\n",
    "                if _len in token_by_length.keys():\n",
    "                    if token not in token_by_length[_len]:\n",
    "                        token_by_length[_len].append(token)\n",
    "                else:\n",
    "                    token_by_length[_len] = [token]\n",
    "                \n",
    "        original_sentence = example[TASK_CONFIG[task][1]]\n",
    "        if original_sentence != None and original_sentence.strip() != \"\" and original_sentence.strip() != \"None\":\n",
    "            tokens, token_dict = modified_tokenizer.tokenize(original_sentence)\n",
    "            for token, pieces in token_dict.items():\n",
    "                if token in token_frequency_map.keys():\n",
    "                    token_frequency_map[token] = token_frequency_map[token] + 1\n",
    "                else:\n",
    "                    token_frequency_map[token] = 1\n",
    "                _len = len(pieces)\n",
    "                if _len in token_by_length.keys():\n",
    "                    if token not in token_by_length[_len]:\n",
    "                        token_by_length[_len].append(token)\n",
    "                else:\n",
    "                    token_by_length[_len] = [token]\n",
    "    elif task == \"conll2003\" or task == \"en_ewt\":\n",
    "        # we have the words already!\n",
    "        tokens = example[TASK_CONFIG[task][0]]\n",
    "        for token in tokens:\n",
    "            if token in token_frequency_map.keys():\n",
    "                token_frequency_map[token] = token_frequency_map[token] + 1\n",
    "            else:\n",
    "                token_frequency_map[token] = 1\n",
    "            _len = 1 # all is one!\n",
    "            if _len in token_by_length.keys():\n",
    "                if token not in token_by_length[_len]:\n",
    "                    token_by_length[_len].append(token)\n",
    "            else:\n",
    "                token_by_length[_len] = [token]\n",
    "    else:\n",
    "        print(f\"task={task} not supported yet!\")\n",
    "    return example\n",
    "\n",
    "def generate_vocab_match_no_frequency_iv(token_by_length, token_frequency_map):\n",
    "    vocab_match = {}\n",
    "    tokens = list(task_token_frequency_map.keys())\n",
    "    tokens_copy = copy.deepcopy(tokens)\n",
    "    random.shuffle(tokens_copy)\n",
    "    for i in range(len(tokens)):\n",
    "        vocab_match[tokens[i]] = tokens_copy[i]\n",
    "    return vocab_match\n",
    "\n",
    "def generate_vocab_match_frequency_iv(token_by_length, token_frequency_map):\n",
    "    vocab_match = {}\n",
    "    for _, tokens in token_by_length.items():\n",
    "        tokens_copy = copy.deepcopy(tokens)\n",
    "        \n",
    "        # token_frequency_map, token_lemma_map)\n",
    "        \n",
    "        token_freq_tu = []\n",
    "        for t in tokens:\n",
    "            token_freq_tu.append((t, token_frequency_map[t]))\n",
    "        token_freq_tu = sorted(token_freq_tu, key=operator.itemgetter(1), reverse=True)\n",
    "        \n",
    "        matched_to = set([])\n",
    "        for i in trange(0, len(token_freq_tu)):\n",
    "            found = False\n",
    "            for j in range(0, len(token_freq_tu)):\n",
    "                word_i = token_freq_tu[i][0]\n",
    "                word_j = token_freq_tu[j][0]\n",
    "                if i != j and word_j not in matched_to and \\\n",
    "                    levenshteinDistance(word_i, word_j) > 0.3:\n",
    "                    matched_to.add(word_j)\n",
    "                    vocab_match[word_i] = word_j\n",
    "                    found = True\n",
    "                    break\n",
    "            if not found:\n",
    "                vocab_match[word_i] = word_i\n",
    "            \n",
    "    return vocab_match\n",
    "\n",
    "def generate_vocab_match_abstract(token_frequency_map):\n",
    "    vocab_match = {}\n",
    "    vocab_list = []\n",
    "    for k,v in token_frequency_map.items():\n",
    "        vocab_list.append(k)\n",
    "    random.shuffle(vocab_list)\n",
    "    \n",
    "    abstract_matches = []\n",
    "    abstract_len = 4\n",
    "    for i in range(0, abstract_len):\n",
    "        az_list = []\n",
    "        for j in range(ord('a'), ord('z')+1):\n",
    "            az_list.append(chr(j))\n",
    "        abstract_matches.append(az_list)\n",
    "    from itertools import product\n",
    "    abstract_matches = product(*abstract_matches)\n",
    "    good_abstract = []\n",
    "    for match in abstract_matches:\n",
    "        abstract = \"\".join(match)\n",
    "        if len(modified_tokenizer.tokenize(abstract)[0][0]) == abstract_len:\n",
    "            good_abstract.append(abstract)\n",
    "            if len(good_abstract) != 0 and len(good_abstract) % 10000 == 0:\n",
    "                print(f\"generating abstract token in progress: {len(good_abstract)}\")\n",
    "    \n",
    "    assert len(good_abstract) >= len(vocab_list)\n",
    "    \n",
    "    for i in range(0, len(vocab_list)):\n",
    "        vocab_match[vocab_list[i]] = good_abstract[i]\n",
    "    return vocab_match\n",
    "\n",
    "def generate_vocab_match_no_frequency_oov(wiki_token_frequency_map, \n",
    "                                          token_frequency_map,\n",
    "                                          match_high=False, \n",
    "                                          match_similar=False):\n",
    "    vocab_match = {}\n",
    "    wiki_vocab_to_use = []\n",
    "    if match_similar:\n",
    "        in_vocab_rank = []\n",
    "        for k, v in token_frequency_map.items():\n",
    "            in_vocab_rank.append(k)\n",
    "        wiki_tuples = []\n",
    "        for k, v in wiki_token_frequency_map.items():\n",
    "            if k not in task_token_frequency_map.keys():\n",
    "                wiki_tuples.append((k, v))\n",
    "        wiki_tuples = random.sample(wiki_tuples, k=len(in_vocab_rank))\n",
    "        wiki_tuples = sorted(wiki_tuples, key=lambda x: (x[1],x[1]), reverse=True)\n",
    "        for i in range(len(in_vocab_rank)):\n",
    "            vocab_match[in_vocab_rank[i]] = wiki_tuples[i][0]\n",
    "    else:\n",
    "        if not match_high:\n",
    "            for k, v in wiki_token_frequency_map.items():\n",
    "                if k not in task_token_frequency_map:\n",
    "                    if v == 1:\n",
    "                        wiki_vocab_to_use.append(k)\n",
    "            random.shuffle(wiki_vocab_to_use)\n",
    "            freq_idx = 0\n",
    "            for k, v in task_token_frequency_map.items():\n",
    "                vocab_match[k] = wiki_vocab_to_use[freq_idx]\n",
    "                freq_idx += 1\n",
    "        else:\n",
    "            in_vocab_rank = []\n",
    "            for k, v in token_frequency_map.items():\n",
    "                in_vocab_rank.append(k)\n",
    "            in_vocab_rank = in_vocab_rank[::-1] # reverse the order\n",
    "            idx = 0\n",
    "            for k, v in wiki_token_frequency_map.items():\n",
    "                if k not in task_token_frequency_map:\n",
    "                    wiki_vocab_to_use.append(k)\n",
    "                    idx += 1\n",
    "                    if idx == len(in_vocab_rank):\n",
    "                        break\n",
    "            assert len(wiki_vocab_to_use)==len(in_vocab_rank)\n",
    "            for i in range(len(wiki_vocab_to_use)):\n",
    "                vocab_match[in_vocab_rank[i]] = wiki_vocab_to_use[i]\n",
    "    return vocab_match\n",
    "\n",
    "def random_corrupt(task, tokenizer, vocab_match, example):\n",
    "    # for tasks that have single sentence\n",
    "    if task == \"sst3\" or task == \"wiki-text\" or task == \"cola\":\n",
    "        original_sentence = example[TASK_CONFIG[task][0]]\n",
    "        if original_sentence != None and original_sentence.strip() != \"\" and original_sentence.strip() != \"None\":\n",
    "            corrupted_sentence = corrupt_translator(original_sentence, tokenizer, vocab_match)\n",
    "            example[TASK_CONFIG[task][0]] = corrupted_sentence\n",
    "    # for tasks that have two sentences\n",
    "    elif task == \"mrpc\" or task == \"mnli\" or task == \"snli\" or task == \"qnli\":\n",
    "        original_sentence = example[TASK_CONFIG[task][0]]\n",
    "        if original_sentence != None and original_sentence.strip() != \"\" and original_sentence.strip() != \"None\":\n",
    "            corrupted_sentence = corrupt_translator(original_sentence, tokenizer, vocab_match)\n",
    "            example[TASK_CONFIG[task][0]] = corrupted_sentence\n",
    "        \n",
    "        original_sentence = example[TASK_CONFIG[task][1]]\n",
    "        if original_sentence != None and original_sentence.strip() != \"\" and original_sentence.strip() != \"None\":\n",
    "            corrupted_sentence = corrupt_translator(original_sentence, tokenizer, vocab_match)\n",
    "            example[TASK_CONFIG[task][1]] = corrupted_sentence\n",
    "    elif task == \"conll2003\" or task == \"en_ewt\":\n",
    "        original_tokens = example[TASK_CONFIG[task][0]]\n",
    "        corrupted_tokens = [vocab_match[t] for t in original_tokens]\n",
    "        example[TASK_CONFIG[task][0]] = corrupted_tokens\n",
    "    else:\n",
    "        print(f\"task={task} not supported yet!\")\n",
    "    return example\n",
    "\n",
    "def plot_dist(vocab, map1, map2, facecolor='b', post_fix=\"mismatched\"):\n",
    "    \"\"\"Show a histogram of the frequency gap between swapped tokens.\n",
    "\n",
    "    For every (source, target) pair in `vocab` the value\n",
    "    |map1[source] - map2[target]| is collected; the values are drawn as a\n",
    "    50-bin histogram with a logarithmic y axis.\n",
    "\n",
    "    Args:\n",
    "        vocab: dict mapping a source token to its replacement.\n",
    "        map1: frequency lookup used for the source tokens.\n",
    "        map2: frequency lookup used for the replacement tokens.\n",
    "        facecolor: bar colour passed to `Axes.hist`.\n",
    "        post_fix: accepted but not used by the function body.\n",
    "    \"\"\"\n",
    "    import matplotlib.ticker as mtick\n",
    "\n",
    "    freq_diff = [abs(map1[src] - map2[dst]) for src, dst in vocab.items()]\n",
    "\n",
    "    fig = plt.figure(figsize=(8,3.5))\n",
    "    ax = fig.add_subplot(111)\n",
    "    ax.hist(freq_diff, bins=50, facecolor=facecolor, alpha=0.8)\n",
    "\n",
    "    y_formatter = mtick.FormatStrFormatter('%.2e')\n",
    "    ax.yaxis.set_major_formatter(y_formatter)\n",
    "    # NOTE(review): changing the scale after installing the formatter presumably\n",
    "    # resets it to the log-scale default -- confirm which tick format is wanted.\n",
    "    ax.set_yscale('log')\n",
    "\n",
    "    ax.set_xlabel(\"Difference in Frequencies\")\n",
    "    ax.set_ylabel(\"Frequency (LOG)\")\n",
    "    x_formatter = mtick.FormatStrFormatter('%1.0e')\n",
    "    ax.xaxis.set_major_formatter(x_formatter)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Wiki-Text Frequency for OOV\n",
    "\n",
    "If you want to explore out-of-task vocab swapping, you can look into these lines of code!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "wiki = False # turn it on if you want to run. It will take a long time to finish.\n",
    "if wiki:\n",
    "    wiki_datasets = DatasetDict.load_from_disk(os.path.join(external_output_dirname, \"wikitext-15M\"))\n",
    "    wiki_train_df = wiki_datasets['train']\n",
    "    wiki_eval_df = wiki_datasets['validation']\n",
    "    wiki_test_df = wiki_datasets['test']\n",
    "\n",
    "    # Counters filled as a side effect of `token_stats_mapping`.\n",
    "    token_by_length = {}\n",
    "    token_frequency_map = {}\n",
    "    # `token_stats_mapping` is only called for its side effects. `map` may reuse a\n",
    "    # cached result without calling the function, which would leave the counters\n",
    "    # empty, so the cache is bypassed explicitly.\n",
    "    wiki_train_df = wiki_train_df.map(partial(token_stats_mapping, \"wiki-text\"), load_from_cache_file=False)\n",
    "    wiki_eval_df = wiki_eval_df.map(partial(token_stats_mapping, \"wiki-text\"), load_from_cache_file=False)\n",
    "    wiki_test_df = wiki_test_df.map(partial(token_stats_mapping, \"wiki-text\"), load_from_cache_file=False)\n",
    "    # Keep `token_frequency_map` a dict; the frequency-sorted copy gets its own name.\n",
    "    wiki_token_frequency_items = sorted(token_frequency_map.items(), key=operator.itemgetter(1), reverse=True)\n",
    "    wiki_token_frequency_map = OrderedDict(wiki_token_frequency_items)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get mismatched vocab!\n",
    "Simply set your task name below for new tasks! Be sure to have your dataset downloaded into the right folder!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Task setup: the task whose vocabulary is scrambled by the cells below.\n",
    "task_name = \"sst3\"\n",
    "# Random seeds.\n",
    "# WARNING: the seed may change your results as well. Try a few different seeds.\n",
    "seed = 42\n",
    "torch.manual_seed(seed)\n",
    "np.random.seed(seed)\n",
    "# NOTE(review): `random` is not imported by name in the imports cell; presumably\n",
    "# it comes from one of the `import *` lines -- confirm.\n",
    "random.seed(seed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Step 1: Get frequency distribution of the dataset.\n",
    "\n",
    "Note that we are cheating a little here. For tasks that have a pair of sentences, we compute the word frequencies jointly over both sentences. This is also the worst-case scenario, since the two sentences may have different vocabulary distributions! If this works, we are confident that more fine-grained scrambling should work as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token-level tasks are fetched with `load_dataset`; every other task is read\n",
    "# from the TSV files of its directory and wrapped into `Dataset` objects.\n",
    "if task_name in (\"conll2003\", \"en_ewt\"):\n",
    "    if task_name == \"conll2003\":\n",
    "        dataset = load_dataset(\"conll2003\", cache_dir=cache_dir)\n",
    "    else:\n",
    "        dataset = load_dataset(\"universal_dependencies\", \"en_ewt\", cache_dir=cache_dir)\n",
    "    train_df, eval_df, test_df = (\n",
    "        dataset[split_name] for split_name in (\"train\", \"validation\", \"test\")\n",
    "    )\n",
    "else:\n",
    "    # handle token data differently\n",
    "    task_dir = os.path.join(external_output_dirname, FILENAME_CONFIG[task_name])\n",
    "    train_df, eval_df, test_df = (\n",
    "        pd.read_csv(os.path.join(task_dir, file_name), delimiter=\"\\t\")\n",
    "        for file_name in (\"train.tsv\", \"dev.tsv\", \"test.tsv\")\n",
    "    )\n",
    "\n",
    "    train_df, eval_df, test_df = map(Dataset.from_pandas, (train_df, eval_df, test_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "**** Dataset Statistics ****\n",
      "training example = 159274\n",
      "validation example = 1100\n",
      "testing example = 2210\n",
      "****************************\n"
     ]
    }
   ],
   "source": [
    "# Report the number of examples in each split.\n",
    "print(\"**** Dataset Statistics ****\")\n",
    "print(f\"training example = {len(train_df)}\")\n",
    "print(f\"validation example = {len(eval_df)}\")\n",
    "print(f\"testing example = {len(test_df)}\")\n",
    "print(\"****************************\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "675dc9a7744d410f8fd2d03dd571c16b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=159274.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5885a7a4af96438eac97deb4538e9ae5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=1100.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a0f7df8ec6024282b94fca82ccb5896f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=2210.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "token_by_length = {} # reset this every time for a new dataset\n",
    "token_frequency_map = {} # reset this every time for a new dataset\n",
    "# `token_stats_mapping` is only called for its side effects on the two dicts\n",
    "# above. `map` may reuse a cached result without calling the function (e.g. for\n",
    "# datasets obtained through `load_dataset`), which would leave the dicts empty,\n",
    "# so the cache is bypassed explicitly.\n",
    "train_df = train_df.map(partial(token_stats_mapping, task_name), load_from_cache_file=False)\n",
    "eval_df = eval_df.map(partial(token_stats_mapping, task_name), load_from_cache_file=False)\n",
    "test_df = test_df.map(partial(token_stats_mapping, task_name), load_from_cache_file=False)\n",
    "task_token_by_length = OrderedDict(token_by_length)\n",
    "# Task vocabulary ordered from the most to the least frequent token.\n",
    "task_token_frequency_map = sorted(token_frequency_map.items(), key=operator.itemgetter(1), reverse=True)\n",
    "task_token_frequency_map = OrderedDict(task_token_frequency_map)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Step 2(a): Frequency matched in-vocab swap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10136/10136 [00:10<00:00, 1005.16it/s]\n",
      "100%|██████████| 2389/2389 [00:00<00:00, 4490.77it/s]\n",
      "100%|██████████| 4430/4430 [00:01<00:00, 2393.40it/s]\n",
      "100%|██████████| 743/743 [00:00<00:00, 7479.24it/s]\n",
      "100%|██████████| 111/111 [00:00<00:00, 10249.60it/s]\n",
      "100%|██████████| 16/16 [00:00<00:00, 7912.85it/s]\n",
      "100%|██████████| 1/1 [00:00<00:00, 4899.89it/s]\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAADnCAYAAADMzFY6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3deZxcVZn/8c+XQCBBIEAA2QOyLyoYGLaRsI3gyDiIIIjsi6DOEEcdxg0Qld+IG64oIEQdFHEDVFRQCLtAUBBBDFsIArInLEkIhOf3xzkFxc2trrrVVV3d1d/363Vft/uec289dep299P3nnOuIgIzMzOzXlmi1wGYmZnZ6OZkxMzMzHrKyYiZmZn1lJMRMzMz6yknI2ZmZtZTTkbMzMysp5yMmJmZWU85GTEzM7OecjJiZmZmPbVkqxUlLQnsAewKbAqsASwLvAjMBe4FZgCXRMTMzodqZmZm/UjNpoOXtAQwFfgIsCqgJscM4ArgwxFxSyeCNDMzs/41YDIiaQJwMbATMB24HLgdeIh0NeR5YAwwDpgIrA9MBv4NWAV4f0Sc1b3wzczMbKRrmIzkKyK/JyUbH4yIm1s+qDQWeC9wKnBwRFzYgVjNzMysDw2UjBwMvAt4R0QsbOvg0i7AV4FtImJB21GamZlZ32raZ8SSiRMnxqRJk3odhpmZ2Yh08803Px4Rq5SVtTyaZrSbNGkSM2bM6HUYZmZmI5Kk+xuVeZ4RMzMz66mmV0YkfQRYKn97VURcU1e2qGSXayPizR2Kz8zMzPrcgMmIpG2Az5HmDnkSuKdYpWS3HSVtExE3dSZEMzMz62fNbtO8lZSInAKsHhE/KpRHRCxRW0hXUGYC7+x8qGZmZtaPmiUjOwHfjYhPRcSLJeWvujISEYuAbwDbdyg+MzMz63PN+oxsCnx0gPLflmy7DPh42xENAUkfAL4G7BIR03sVx+TJA5d78I6ZmY0Gza6MrAzc3agwIvYq2TwLmDCImLpK0hqk5+yYmZnZMNAsGXkBqDpz6higbJTNcPE10jT1ZmZmNgw0S0YeAjapeMzN8n5tkTRR0gWSQtJhTeouL+nLkmZLWiBppqRPSFqqQf29SQlW2e0lMzMz64FmyciVwJEVj/nevF9lkvYlPRV4jxbqLg9cC+wHvBtYETghLxdJGlOovyzwWeCD7cRmZmZm3dEsGTkTOFbSIa0cLHcMPRz4VtVAJB1HuoVyBHBRC7t8FtgCOCYiromI+RHxc+BkYC9SUlTv08C3IuLhqrGZmZlZ9wyYjETEzcC3gXMlXSrpPZI2kjReyXhJm0g6XNJVwFeAb+f9qroN2DwiftWsoqTlgKOAh4FfF4qnkeZG+WBd/a2Bf6KNJMnMzMy6q5UH5R0PLAscCuw2QD0B5wL/0U4g9dPMt2BXYBnghig8djginpA0E9hY0kYRMRP4V2AccLkk8r4Ap0uaA7w3Iv7WTtxmZmY2OE0flBcRiyLicODtwNWkkTKqWxaR+ojsHRFHRsRLXYy3Zsu8ntWgvLZ9S4CI+HREbB0RUyJiCnBALp+atzkRMTMz65FWrowAEBG/AH6RO4KuBywPPA3cFxHPdSm+Rl6b1081KJ+T16sN5kUkHQMcA7DOOusM5lBmZmbWQMvJSE1OPP7ShViqGJfXLzQoX5jX44sFkk4Htsvfni7projYr+wgEXEmqRMvkydPjrI6ZmZmNjiVkhFJ44AdgY1IV0bmAH8DromIRolBN8zP69L5RICxeT2vWBARU7sSkZmZmbWlpWQkJyGfAo4ldWYtelrSl4H/jYiFJeWd9o+8XrFBeW06+keGIBYzMzMbhKbJiKTVgN8Bm+dNjwJ/J111GA+sBawKnATsJeltEfFEd8J92W15vV6D8kmFemZmZjZMDZiMKI2DvQhYlzSZ2Hcj4v6SeuuShv5+GLiAgYcAd8LlwPPAtpJUP7xX0sqk20j3
5GG9ZmZmNow1G9p7ELAhsH1EnFKWiABExP0RcQqwA/DGPK1710TEM8B3gNVJs63WO4w05Pj0bsZgZmZmndEsGTkQODkibm/lYBHxF1Lfkpamjx+kjwF3AGdK2knSOEn7kK7gXIpnWzUzMxsRmiUjWwEXVjzmz4FtqgYiaVJ+Um+QbvlAmoY+JM0q1o+IuaQrMT8Bfkga2XNaXvaOiBerxmBmZmZDr1kH1pVIz3+p4uG8XyURMYt0e6XKPnOBqXkxMzOzEajZlZGngZUrHnPlvJ+ZmZlZU82SkVuBt1Q85p55PzMzM7OmmiUjPwZOkbRqKweT9FpSB9YfDTYwMzMzGx2aJSPnAM8AN0o6QNLYskqSxko6EPgDMBc4t7NhmpmZWb8asANrRLwo6e2kScbOA+ZJuo00A+t80gPr1gK2JM3GOht4e0Qs6mrUZmZm1jeaTgcfEfdKmgx8FdifV554W+8l4Hzg+Ih4vLMhmpmZWT9r6UF5OcF4t6T/IXVo3QhYgXRLZibw24iY3bUozczMrG+1lIzU5ITjrC7FYmZmZqNQpWRkIJK+CmyRv42I6PbD8szMzKwPdCwZAf4EPEHq1PqRDh7XzMzM+ljHkpGIOBdA0gScjJiZmVmLms0z0o7owjHNzMysT3UjGTEzMzNrmZMRMzMz66kBkxFJlw9VIGZmZjY6NbsysvOQRGFmZmajVrPRNJJ0b8Vj+taPmZmZtayVob2T2jiuR9SYmZlZS5olIwGsB6jCMVcgTYBmZmZm1lTT2zRVH4AnaQWqJS9mZmY2ig2YjERE5f4fETEX9xsxMzOzFjlpMDMzs55qmIxI2kHSjoM5uKQxkt4raZnBHMfMzMz610BXRh4Ffilpr3YOLGks8GNg94hY0M4xzMzMrP81TEYi4m7gfcDFkn4uaSdJTW/rSJoo6f3A3cAGwJEdi9bMzMz6TrMOrD+UNBc4C7gSWCBpJvAQMAdYCIwBlgFWIQ0DXps0muZC4IiIeLp74ZuZmdlI13TSs4i4RNLGwNHAe4A3Am9oUH0u6dbMNyPiqo5FaWZmZn2rlRlYiYhngS8DX87ziGwCrAGMB14kJSH3AndHxEtditXMzMz6UEvJSL08j8gNXYjFzMzMRiHPM2JmZmY95WTEzMzMesrJiJmZmfWUkxEzMzPrKScjZmZm1lNORszMzKynKiUjkk7sViBmZmY2OlW9MnKSpPW7EomZmZmNSlWTEQG3SvqdpP0kVZ40zczMzKxeO31G1gIuBj4JPCjpNEkbdDYsMzMzGy2qJiOHR8TciPhqRLwe2If0tN5bJF0u6V2Slup8mGZmZtavKiUjEfHdwvfXRcThwJrAr4BzSFdLPi9po86FaWZmZv1q0EN78y2ajwMnAOOAicC/ADMkXSFpz8G+hpmZmfWvqkN7F+X1kpL2l/R74G/Ah4GxwLeArSPiDcDqwA+AMyR9rLNhm5mZWb+oOhpGkj4HHErqKyLgRuBM4PyImFerGBHPAWdJ+g0wAzi1MyGbmZlZP2lnaO5HgLnAGcCZEfHnJvWXz4uZmZnZYtpJRg4HLoiI+c0qStoRuIJ0K8fMzMxsMVWTkSuLI2qauB5YEXix4uuYmZnZKFF1aO8ujcokTZS0RKH+SxHxXEQ8326AZmZm1t+qjqbZQdKTeTm9UHwQaY6R/ToXnpmZmfW7qrdpDs37fAq4oFB2AbAy8D1JcyLisg7EZ2ZmZn2uajKyHXB0RPyoWBARDwMnSpoNfAxwMmJmZmZNVZ2BdW3g103q/BjYsr1wzMzMbLSpmows1cI+ApZuLxwzMzMbbaomI7cBxzapcxzQbCI0MzMzM6B6n5EzgO9K2gw4nzSZ2XPAssAmwAHAu4FDOhmkmZmZ9a9KyUhEfF/SPwHvIw3lLRLw9Yg4rxPBdZqkt5FiX5qUQD0FnNDClPZmZmbWJVVv0xARHwDeDvwWeAJYlNe/BvaOiP/saISdNQ34v4jYLSK2A24Ffi9ptd6GZWZm
Nnq182waIuIXwC86HMtQuCoiflD3/ReBE4B/Ab7fm5DMzMxGt8pXRloh6cRB7j9R0gWSQtJhTeouL+nLkmZLWiBppqRPSFqqWDci3lHYVHvYn0f/mJmZ9UhXkhHgpHZ3lLQvcDuwRwt1lweuBfYjdZxdkXSl4wTgIkljmhxie2ABcHG78ZqZmdngVE5GJL1D0s8k3SbpHkn3Fpd2g5F0HPA14AjgohZ2+SywBXBMRFwTEfMj4ufAycBewHsHeC0BnwQ+ERGPthuzmZmZDU7VB+UdD/wE+HdgdWAMaQRNcWnXbcDmEfGrFmJZDjgKeJjFZ4WdBgTwwQEOcSpwf0R8sb1QzczMrBOqdmCdCvwIOH6gqwmSXmonmIi4pkL1XYFlgBsiIgrHeULSTGBjSRtFxMxCfFOBzYB924nTzMzMOqfqbZrVgakt3Nb4VJvxVFF7/s2sBuW17a96To6ko4C3AvtHxIuS1pe0e1ciNDMzs6aqXhm5hxZuw0TEUCQjr83rpxqUz8nrl+cQkXQA8HHgMGDL1G2EN5GSrN8VDyDpGOAYgHXWWacTMZuZmVlB1Ssj/0saqTIgSYvaC6eScXn9QoPyhXk9vm7b94FJwHTgprx8q9ELRMSZETE5IiavssoqgwrWzMzMylW9MiJgc0l/Ig2HfYhX5uoYarXXXWw+kWxsXs+rbYiIRnXNzMysR6omI9Pqvn5DXkehjkq2dcM/8nrFBuUT8vqRIYjFzMzM2tTOdPCHNykX8J02jlvVbXm9XoPySYV6ZmZmNgxVTkYi4rvN6kg6p71wKrkceB7YVpLqh/dKWhnYCLinOKzXzMzMhpd2hvY2FRHdmma+/jWeIV2BWZ0022q9w0hXaE7vdhxmZmY2OJWShohYrP9Ffj5Mr3wMuAM4U9JOksZJ2oc0HfylDDBSxszMzIaHtq5gSDpS0tWSngOezNt2lnS2pDXbDUbSpPyk3gAOzZvPzdtmFetHxFxgB9IU9T8kzS1yWl72jogX243FzMzMhkalPiOSxpKG9O7BK5Of1fpqPARMBq6RtENEPFw1mIiYRcVn2+SEZGpezMzMbISpemXkQ8A/A58BNgdWqBVExF3A1sDNtDAxmpmZmRlUT0YOAo6IiJMi4q+5E+nLIuIl4EQW71BqZmZmVqpqMrI+cGGTOvcBa7UXjpmZmY02VZOR+cCqTepsSN0U7GZmZmYDqZqMXAd8s9FwXklLAKcCVw82MDMzMxsdqs7AeipwFTBb0o+BWwEkHU6alv0g0i2aHToZpJmZmfWvSslIRFwv6WDgbOBI0rBe5e8FPAu8OyJu7nSgZmZm1p/aeTbN+ZKuIE1Ktg1peO8c4AbgexHxWGdDNDMzs37WzlN7a9PCn9bhWMzMzGwU6soD7ST9VzeOa2ZmZv2nW0/X/XyXjmtmZmZ9puqzaS7vViBmZmY2OlXtMzKlxXrRvIqZmZlZe6NpFru1I2kcsCbwDtKTew8dfGhmZmY2GlTtM/KNso0RMT8i7o6I04Af4j4jZmZm1qJKyUhE/EcL1S4D9msvHDMzMxttujGaZlNguS4c18zMzPpQ1dE0bx6geFlgY+B4YMZggjIzM7PRo2oH1ukMPFJGwGPA1HYDMjMzs9GlnengTynZFsB84C7g0oiYN6iozMzMbNRoZ2jvp7oRiJmZmY1OVTuwrteVKMzMzGzUqpqMnNvOi3gaeTMzM2ukajKyc5uv0+5+ZmZm1ueq9hmRpEVdicTMzMxGparJyFWkfiNrk4bwPgA8Q5rkbG1gFeCOXFYj4J8HHamZmZn1parJyEeAs4EjI+J3xUJJewAn5fJ767a/NKgozczMrG9VTUY+AxwbEdeXFUbEZZLmAV8E9qkr8nBgMzMzK1W1A+u2NJ/q/UbgVdPGe24SMzMza6RqMrIUsEmTOpsCS7cXjpmZmY02VW/T3ARMk7R/RNxTLJS0ATAN+EMHYrMWTJ48cPkMP7LQzMyGuarJyMeBy4E7Jf0J+Bvw
HOmJvZsAWwELgSkdjNHMzMz6WKVkJCKuk7QXcBYwOS/17gKOiogbOxSfmZmZ9bl2HpR3haQNgR1JV0KWB+YCf4yI6zocn5mZmfW5yskIQEQEcE1ezMzMzNrWVjICIOl1pNs0q0XEVyWtCDwfEfM6Ft0o16xzqpmZWT+oOrQXSZtLuhqYCfwA+HIu2hZ4UNLBHYzPzMzM+lylZCQP3b0a2IE0+dmFdcVXk2Zo/bakXToWoZmZmfW1qrdpTiZdETmoNs9I7Sm++fbMFyU9T3qGzRUdjNPMzMz6VNXbNLsCR5RNeFbnfOAN7YdkZmZmo0nVZGQl0lwiA3kx1zMzMzNrqmoy8hCwc5M6uwAPtBeOmZmZjTZVk5ELgfMkHSppsf4mknYFvg78tBPBmZmZWf+rmox8GngaOAd4WtIMAElXSZoNXJbL/19HozQzM7O+VSkZiYingJ1IV0iWBrYGlLetAfwEeHNEPN3hOM3MzKxPtfNsmkeAfSWtRpqBdQVgDnBTRDzW4fjMzMysz1VKRiSdk7+8ISK+Dfyq8yGZmZnZaFK1z8hhwDakfiFmZmZmg1b1Ns0LwFsi4qFuBGNmZmajT9UrI/cA0aySpEPaC8fMzMxGm6rJyBnA8S3UO7eNWMzMzGwUqnqb5jbgbZIuBS4gzbQ6v+NRmZmZ2ahRNRm5nHSbRsBunQ/HzMzMRpvK84wApzQpF/DJNo5rPTJ58uCPMWNG9+PoxGvY8DRczkGz0aCVn7eh/nlqZ9KzTzWrI+nE9sIxMzOz0aZqB9btW6y3XtVAzMzMbHQa8MqIpHVqX0fE7Ii4oZWDRsT9gw3MzMzMRodmt2lm5XVIWi4i5nU5HjMzMxtlWukzsh5ARMyrv1JSExGzOx6VmZmZjRrNkpEo3HKZxSszsCp/PaYLcZmZmdkooYjGs7tLWhQRY+q+X5eUhNzLK1dMRkX/EEmPAZ1+rxOBxzt8TBuY23zouc17w+0+9NzmA1s3IlYpK6iUjDTbbtVImhERHZhhwVrlNh96bvPecLsPPbd5+6oO7W2JpDd347hmZmbWf7qSjABXdOm4ZmZm1me6lYyoS8ftN2f2OoBRyG0+9NzmveF2H3pu8zY16zPyEq+MnqnEfUrMzMysFa3MM9LOVY62EhgzMzMbfZrdpomIWKLqMiSRd5Ck5SV9WdJsSQskzZT0CUlL9Tq2GkkHSbpJ0jxJj0k6X9LrBqg/VtJJku7K7+l+SV+Q9JqhjLskLknaO8c/W9JCSXMkXSXp4F7GVlSlzSXtIOlzkm6U9Kik5yT9TdK3JW0w1LG3In8OIWlY/fPQj+0uaXdJF0t6RNLzkh6Q9CtJB/Y6Nui/Ns/tfUn+HTNf0t2S/k/SJr2OrUbSnpKulPSMpKck/VLSmxrUfb2kkyVdI+mh/J7ukfQDSVsNdexdERENF+C+gco7vV8vFmB54Dbg78BOwDhgH+AZ4BJgzDCI8RTS1aaPAssBGwHXAHOAzUrqLwX8DpgL7J3f087Aw8AfgWV7+F4+kd/LZcBWwHhgU+CivP2cXrd3m23+LPAP4N+BCXnZD3gyl23V6/dUiHd54IH8HqPX8fRzuwMn5/iPAlYCls3xPg38ZhjE11dtDnwov5/LgS3y75htgVuB54Epw6DNj8gxfjWfE2sBP2sUH/AX4DngUNJcJssBbyFNRPoCsFev39Og26TXAfR6Ab6WT4q3FrbXTuj3deE1pwOzWqw7GXgJuKCwfU1gAXBDyT6lsQP75u2n9bC9P5N/kb2msH0scE+Ob9cR2ObPAgeWbP9gfk/f7VWbN3iP3wSup8vJyGhv9/wHO4B/Kyn7EHCW27yj730sKcl7CVi1ULZNjm+x99SB153W6s8RKfGYD9xA7reZty+bfzfOBpYu7PMX4KMlx9onv6cre9XmnVpG3C2VTpK0HOm/lYeBXxeKp5E+5A8OcVhFU0n9ds6p3xgRDwK/BbaVtGNtuyTlfV4Avl841kWk/16Ok7RMN4Me
wIOkX1bP1m+MiIWkqyUAuw95VK9Wqc2zvUjtW3RXXk/odJDtyrEfARzd61gK+rHdTwXujIiLiwUR8cWI6PVn0G9tviLpqsHjEfFooez2vH790Ia0mGOBZYBzI2cUABHxHPAjYG3gnYV9Dge+XXKs4dDmHTGqkxFgV9JJcUP9SQEQEU8AM4ENJG1U2y5pfO6LcWfui/G4pAsb3evrgLfm9R9Kyq7P63+t2/Z6UuZ9e0Q8U185Il4EbgJeA/RkYrqIOCMiTmhQXIv3VZ2mR0CbExFXR/lTrbfL6993KLZBkTQWOAv4fET8pUldt/sgSHoj6RbkVRX2cZsPQkQ8AjwETJS0aqF487z+R/1GSUtKOl7SLbkvxhxJl0nq1j9Ftfa8vqSsUZvfFBFPltTveZt3ymhPRrbM61kNymvbt4T0i4I0odsJpPvAE0j3IpcErpW0WyeDk7QWKdOfExFzmsVX+HoW5cr2GS5qSd/Lv7xHSJsXjzFW0nqSTgD+mzT3wDc6GecgfIL0c/+ZgSq53Tui9oditqRDJN1c98fut5J2rq/sNu+Yw0j9Xc6XtIWkcZK2Bc7O5V+vVZS0BPBT4EvAd4BVgM1It0oulXRIJwOTNCYfH8p/R9e2DdTmYyStJelY4Iukq1QndjDM3uj1faJeLqSTMoATG5SfT13fC9IHH8AphXorkP6rv58WOrzS4j1d0v3cAO5tUL5nLr+xbtuHGaAjKPC/9LjfSIO4ViLdo/4jr76POuzbvFC+SS4P4Cng/cBSvW7fHNvmuY3/uW5baZ8Rt3tH2vvzOZ77cnvtRuoXsCVwI/AisL/bvCttvzHpakHULXcCxxXq/Ucu+15h+5j8uT0NrNTC600r+zkqqTcxv95LTdr00Qbly9S9n/nAScD4Xrd3J5bRfmVkXF6/0KB8YV6Pl7Qkr9xjP7u+UkTMJfU5WYd062fI4xvkPsPBaaQfsEMi/9SNoDavj+tO0pWHdYCPkEYq/EHSpM6FWF3+D/AsYFpEXN2krtu9M5bP60mk8/r3EfFcRNwGHEi6HfktSa9xm3eOpHcCM0idWN9A6kOyE+kfnRXy1Yma4/K62OaLgB/nffftYHiDbfMFESFgdeAQ4EjgtnxLcEQb7cnI/LxuNJ/I2LyeR8pYlwOeiojZJXUfyOuXn9goaYryPA71C2mY7bplZZKmtBnfYPbpKUkHkS6tHhSv7scwUtr8VSJ5ICLOzu9ra+B7jeoPkfcD65JuATTjdu+sRyLiyvoNEXEPqZ/GisAeuM07QtJ6pI77zwBvj4g/R8SzEXEt8DHgU8AFue5ypD49kIb9FpW1+aQGbX5oLi9r88PqjtmR388R8Y+I+DHwb6Rk9yeSlh5on+GulRlY+1mtI9OKDcprPZQfIV0qBVhRA08StVrd1/OAv5XUWYd0Mt5TUlZ/ElaJbzD79IykPUj/lRwTET8rFI+UNm8oIn4h6VHgnyVtFhF3tLJfJ0lamzSq45D8X3YzbvfOeCqvy5ILSLdddgA2BGojP9zmg/Mu0q2MX0Shk21EzJL0B+AdknYgtX/NHKnhZOP1bf4C5W2+OulKWFlZ/c/cU6SrH2MlrVDy81i1zW+RNIPUt2g30txYI9JoT0Zuy+v1GpRPqqtXy2QfjIi1Wjl4RNxI+o/nVSRNByZFxICzAUbE3yXNASZImhCLdzKrj4/C1628p55S6q3+c+D9EXFOSZXa+x3ubd7M/cCqwAZAL35B70YaQfWzRr9w6/4AXkm6igJu98H6a143m8k58LneKZPy+uEG5bXtb+SVqyEBLBNpeoEBRRryXNbm04BDW2jzRZLuyK+/HnBLg/irtvm2pDYfsUb7bZrLSTPebavCb2lJK5NGd9wTETNJnZ/mAquXXQ6TtISktyj1UO+kWqa7XUnZ9oU6AH8mzeWxWb4MWR/jkqSJf56lwnDDblAaGXAhcHx9IiJpc0nvyt+OiDaXdLCkmwc43up5/XQHYqssIqZFhMqWujq1
bVNwu3dKrQPlukp9dorWzes7cZt3yhOFOIrWyOsXIs3rcTup7846ZZUl7SJpw86GWLnN3yzpsQGO1+s274xe9p4dDgtpGFrQeAbWD9Rtq3WyPKrkOPsCi0j/kTR7zem0PkNibdbA4gyJa5BGRtxUss9HGHgG1i/0uM13JSVEZe14GDB9JLV5jnkhsF7JsabkYz1O+u+r5+d8Ib5Go2nc7p1p35/lON5W2L5+bsMHa/G5zTvS3tvnGB4ExhXK1iX987kI2CBve1+u/5mSY9VGG725hdedVvZz1KDu2jSegfVhUl+VZeq219p1x5JjbZzf0/PAmr06zzvy2fU6gF4vpPvjt1P+bJrfAkvW1R0HXEfKQI8g3UtcEXg36TLrSS2+Zsu/LHL9z+aT8X9Indw2JD07Yi6wRUn9pUjzFZQ9m+YWClOxD3F770K6b/0waeh0cbmBVycjw77NSZ3XgjRl8545vonAAaT78i8A+/b6XG/wPhslI273zrTvmqQ+I/eTJhocS3peyg3552A3t3nH2/ybOcbfkIZRL0tKUv5EIfEg3R34KemP+YdJE0YuB7wtf27ntvia08p+jgaof3SOpfZsmjVJievC+nMi1905150N7E+aC2UF0oR1d+eyqb1s8458br0OYDgs+YM9nZSRPk+aYveTwNiSussAHyclMAuAx0j32fer8HqVflnkfQ4GbiZl1I+TeoRvMED9pUk9x+/J72k2aR6D5Xrc1tN49dj/smX6SGpzUvL3NuA80twEtf9UZpFGFryx1+d4Id7DBmj7KW73jrf3qsAZ+ffLQtIf7R8Am5fUdZt3ps0Pyu0whzSfy+Okfy73Kak7hnSF5GZSgvgUaR6Yo4ElWny9aVRIRvI+ewFXk64SzyHdmplcUk+kqyNnkWYFn5fPo78DPwF27nV7d2JRfrNmZmZmPTHaO7CamZlZjzkZMTMzs55yMmJmZmY95WTEzMzMesrJiJmZmfWUkxEzMzPrKScjZmZm1lNORmzEkjSrwSO7n5N0v6SfSTpioEdrS7okH2eVkrLlJZ0u6T5JCyQ9nI85IZe/RdI1kp6S9KykWyQd0s333I8kfVTSHEm7dPE1Gp0rjZZJ3YplNBuKz9pGJicjNmJFxKQoeZvT0v0AAApoSURBVNgb6bka7yRNlfw14C5JOzY4zHqkGTKXLSn7OukJtseQHu29H2n2yQmSNgUuJs3suBFpOueZpOfuWDXrkGZBXrVbL9DoXCkuNH7atXVG1z9rG5k8A6uNeJLSQ1bq/tjUlb0e+B2wPOmZD9cWyseRHqj1ZMm+TwD3RcTkum2vI02tfyzpuRJ7R8Qvc9kKwPiIaPT4ciuRn2j72oh4aAheq+G5kssnkaY5Xy8iZnU7ntFmKD9rG1l8ZcT6WkT8mfSMiaWBH0gaWyifX5aIZCuRnhtRX/+eiHgBWDlveraubK4Tkeoi4qVh9MfpftKD0u7vdSD9aJh91jaMOBmxvhcRFwF3ki4RHwggaUqhj8CUWn1J02v/QQM719WZVtsPOCmXX1HWz0DS2pLOlvSgpOclzZZ0hqTX1scm6c7C8SdLukzS07XthfqHSPpD7qPyjKRrJb2rUOeAwnvbUNKpOZYFkv4kafeytpI0XtInJf01130kv96pktYvqd80noEU4jy5bvv/FMqWk/QtSY9Jmi/paklbt/o6LcQxKbf1uhHxbESEpO0KMewi6T9z2yysfWZ1x1hJ0peU+hg9L+kfks6TtGGD19xB0hVKfZzm5s9929r5l5fptdganKtfr6/b4HW2l/RLSU/mz/R2SR9TXV8qSa8tvMZh+bO9I7+XWZKmDtB+B0u6Pp8HcyT9WdKZqrs92uizrisfL+mk/DOxQNLjki6U9KaSuitJ+pykmZLmKf18XaLUR2x8ozhtGOv1k/q8eBnsQn7ibJM6X8v1fljYfjKFp9UWjju9wfEG2m9T4FFSn5V/Il2V2Z70NOjZwBqF+pPysa4DrgLeBLwG
+Fz9+yL1YQnSI99XJF2d+Uze9omSOKblsl8CR5H6vWxOevz7PGDtQv3xpEfbzwH2JT3SfnXg0/k4FxbqV4pngM9mSt7n5JKy6bnsJ8A+pNtt2wMPkp5+u2wnzpW6z2DSAJ/1pcBppP5Ba5GerDst13lt/nwfA96SP/Mtcns+BWxZOOYupKfd3gK8MdffEfhzPkfKYqx8rgIHkJ5ae0l+j+NICfm8/H6WKNQ/LB/rMuArpP5XqwPn5+3vLHmN2nnwcdLVxAn5NZ4B5rTyWdede/NyzMsA65PO3QWkW6y1ugJuIj21dqf8ntYCvtWofbwM/6XnAXjxMtil0R+YQp2pud5Nhe2Vf8G3sN+MXLZrYfuueft5he21P4Qvkv4zr21fCzg/f713rnNVyetdlffdpLB9Wt7nq4Xt++ftUwvbv5i3H13yGr+kLhlpJ54BPpvSP1C5bHou+6/C9v/O2/+9nXNlgGXSAJ/17wrbDwKOy1//NNc5olBnA+Al4Nq6bUuQEtVFwEaF+ns0Op+rnqukBOlZ4GlgQqHslLLPmleSkVsL21ejPCGtnQfnlcT0YVpPRmrn3imF7SuQkpr7gTF52+tz3S8U6irXW6x9vAz/xbdpbLSo9e1YoZsvImlb0pWN+yLi8vqy/P1jwDslvaZk9xsj4v66+n+PiAPyt8fl9dkl+50PjAEObhDWRYXv/5rXL98+kLQkqW9NAD8qOcZXSP8t1wwmnnY0fQ9VRHujaC4oHOO8iKjdetuHdKXj/wp17iYlpzvU3a7ZEXgd8KeImFmofxnpHOmEQ0ijxH4SEXMKZefn9WEN9r24ENcjwJMs3t618+CHJce4APhesyDrzj0onE8RMRf4NekWa22kWuT17pJWrqsbwA6kKyw2wjgZsdFiubye2+XX2Tavb2lQ/gAwFtiyQVk7x63tN7mkDKDYYbCWmNXfW9+Y1EYPR8TTxQNExGUR8Y0OxdOOVt5DtzX6fCaT/iu/IyIWDrBfrT22yuu/VXydqlr5jLZWGuFSVNbJ9FkWb+9t8nqx9xIRsyPiP1uIcxPSufdURMweINZa+/2FNELuDcB9kr4jaS9JS0XEgxExv4XXtGFmyV4HYDZENsrrmQPWGrzalZd9VOh8WrBaybaBfonWjnurVDoqtdExy45bi6v+QBPy+rkBYuhUPJWV/IEpew+DOf6sFo7V6POptcVWLX7my+d1o7Z+pkkcrarF9RVJX2lQZ5kcT/HKSdl7DRZvo6rnTZlanCu20n4REZLeSpoD6DjgiLw8IunzwJfyVRIbQZyMWN9T+mtZGz3y2y6/XO2X+nkR8Z4OH3cisGG+9N9ptbjLJn/rRTwjSa3tro2InVqoX7s616itl2uwvfQP7ACjR2pxHR0RZbfTOqF2HrR63jQ6BsCDEbFWKztEGl5/OnC6pK2A9wDvBb5A6tD6mUHEYz3g2zQ2GuxH6kg4m1fulXdL7X71pLJCSRMl7dnG8MNmx91OaYK3dt1J6ui4utLkbcXjbyvpfUMYz0hyE6mT6qSyQknL5s+81r/hj3m9SYPjrd1ge+1qRfEP/5oN6jf7jDaqHybcphvzerH3ImkNSVMlNZtt9U5Sgra6Sh7dIGkJpUcvrJW/nyjpzbXyiPhTRHwI2DNvemc7b8R6y8mI9TWluSjOIHUuPLDBPf2OiYgZpF/Q20vaqKTKSaShkAsqHvqbeX1YsUDSmqRRJ2+seMyXRcQi4CzSZfj9S6p8gVeuLnU9npEkIh4ljaZZU+XztxxL6hT8fP7+OtIw4K2K54ikPYDFnpOU3ZXXGxe279Og/vdI/TwOyp1E619nDKmD6WCfpXRGXh9QUnYs8FFSx9eG8rl3JunvUVmn531IQ5Nr72EL4Pz8HurdntfuMzICORmxvqP0gLttJH0BuIZ0P3v3iLhuiEI4hDQi4peSdleasGuNPNHT0cD7IuKlKgeMiEtIl6XfnSd7Wj9PErUzabTBlZSPaKjiRFIi9XlJ+0oa
J2ktSV8jdbj92BDHM5J8gNQf6fuS3iFpRUmrSPoAaR6W4yPiWXh51MdRwELgAklvkDQ2TxB2MnBHg9e4lHReHZ+vVC0n6R280on0VXKSdDhpiPjF+XXGS9qMlIisyCuT97Ul0qMQvk46Dz6e3/cESUcBJ+T3/WILhzoJuB74Up64bLV8rHcD3yEN+Z1VV3914CxJr5O0tNKEfF/PZY36x9hw1uuxxV68tLsAsyifK2Ie6ZbMhcCRwNIl+04p2zeXTW9w3EmN9iNPflV3/DVI/zXOJv1H/ABp4q7JhXplrzV9gPd8ICnBqs0fcStpPodxTd7b9AFeb0rdvuNJSclfSVdvHgJ+DGzabjxNPsOytpzCK/NdLNbGDT73SW2eK1MG2GdSg30Oa1B/AmmiurvzZ/4w8BtSIlxWfzvgctL5+hRp+PIGtc+owT6Tc3svICUm3yTdtqmP78OFfbYh/Sw8QbpqcBfpuUprNvss8vaTS8pOLux7MCmZeI50JeRKYM9WPuu68mVIE6fdXvf+rgT2KxxnXD4/Lsmfa+08/TV1k6N5GVmLH5RnZjaMKE3rvnM0eJifWT/ybRozMzPrKScjZmZm1lNORszMhgFJJ+dJv3bO3zd8Eq9Zv3GfETMzM+spXxkxMzOznnIyYmZmZj3lZMTMzMx6ysmImZmZ9ZSTETMzM+spJyNmZmbWU/8fLGQuQ7EMpzIAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 576x252 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# generate the token-swap mapping\n",
    "vocab_match_frequency_iv = generate_vocab_match_frequency_iv(\n",
    "    task_token_by_length,\n",
    "    task_token_frequency_map,\n",
    ")\n",
    "# plot the frequency distribution afterwards (the diff)\n",
    "plot_dist(\n",
    "    vocab_match_frequency_iv,\n",
    "    task_token_frequency_map,\n",
    "    task_token_frequency_map,\n",
    "    facecolor='b',\n",
    "    post_fix=\"matched\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to the disk if things look good!\n",
    "# one shared callable: the same corruption is applied to every split\n",
    "_corrupt_fn = partial(\n",
    "    random_corrupt,\n",
    "    task_name,\n",
    "    modified_basic_tokenizer,\n",
    "    vocab_match_frequency_iv,\n",
    ")\n",
    "corrupted_train_dataset = train_df.map(_corrupt_fn)\n",
    "corrupted_validation_dataset = eval_df.map(_corrupt_fn)\n",
    "corrupted_test_dataset = test_df.map(_corrupt_fn)\n",
    "\n",
    "corrupted_datasets = DatasetDict(\n",
    "    {\n",
    "        \"train\": corrupted_train_dataset,\n",
    "        \"validation\": corrupted_validation_dataset,\n",
    "        \"test\": corrupted_test_dataset,\n",
    "    }\n",
    ")\n",
    "corrupted_datasets.save_to_disk(f\"../data-files/{FILENAME_CONFIG[task_name]}-corrupted-matched\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "original text: they tried to squeeze too many elements into the film\n",
      "scrambled text: best equal s clue if off difficult good a as\n"
     ]
    }
   ],
   "source": [
    "# pick out an example sentence to show the scrambling.\n",
    "example = train_df[random.randint(0, len(train_df)-1)]\n",
    "for name in TASK_CONFIG[task_name]:\n",
    "    if name is None:\n",
    "        continue  # skip placeholder (None) entries in the task's field list\n",
    "    example_sentence = example[name]\n",
    "    print(f\"original {name}: {example_sentence}\")\n",
    "    if task_name in (\"conll2003\", \"en_ewt\"):\n",
    "        # for these tasks the field is iterated token by token, so map each token directly\n",
    "        corrupted = [vocab_match_frequency_iv[t] for t in example_sentence]\n",
    "    else:\n",
    "        corrupted = corrupt_translator(example_sentence, modified_basic_tokenizer, vocab_match_frequency_iv)\n",
    "    print(f\"scrambled {name}: {corrupted}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Step 2(a): Frequency mis-matched in-vocab swap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAADnCAYAAADMzFY6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3debgcRb3/8feHJZAgkEjYtwMCIqtoQFmuhO0KKteLiIIIhkUF9Uq86vW6san8FDdEZVWJeFHEDRFRQSHsW1AUUYwgIQjInrCEne/vj6ohQ6dn6TkzmXPOfF7P008nXdXV367pc05Nd1W1IgIzMzOzflmi3wGYmZnZYHNjxMzMzPrKjREzMzPrKzdGzMzMrK/cGDEzM7O+cmPEzMzM+sqNETMzM+srN0bMzMysr9wYMTMzs75aqt2MkpYCdgN2Bl4BrAEsBzwLzAf+AcwCLoiI2d0P1czMzMYitZoOXtISwHTgo8AqgFqUGcAlwEci4sZuBGlmZmZjV9PGiKSJwHnADsBM4GLgZuBu0t2Qp4AlgfHAZGB9YArwH8DKwPsj4vTehW9mZmajXcPGSL4j8jtSY+NDEXFD24VK44D3AscBB0TEuV2I1czMzMagZo2RA4C3A2+JiKc7KlzaCTgR2Doinuw4SjMzMxuzWvYZsWTy5MkxNDTU7zDMzMxGpRtuuOGBiFi5LK3t0TSDbmhoiFmzZvU7DDMzs1FJ0h2N0jzPiJmZmfVVyzsjkj4KLJ3/e1lEXFGX9lzJLldGxOu6FJ+ZmZmNcU0bI5K2Br5AmjvkIeC2YpaS3baXtHVEXN+dEM3MzGwsa/WY5g2khsixwOoR8cNCekTEErWFdAdlNvDW7odqZmZmY1GrxsgOwHcj4piIeLYk/UV3RiLiOeCbwLZdis/MzMzGuFZ9Rl4BfLxJ+m9Ktl0EfLLjiAbJlCnN0z16x8zMBkCrOyMrAbc2SoyIPUo2zwEmDiMmMzMzGyCtGiPPAFVnTl0SKBtlY2ZmZraIVo2Ru4GNK5a5Sd7PzMzMrKVWjZFLgUMqlvnevN+IJekDkkLS1H7HYmZmNuhadWA9DbhG0vURcWarwiR9ADgIeE03gusFSWsAH+13HGZmZpY0vTMSETcApwJnSLpQ0jslbSRpgpIJkjaWdJCky4CvAafm/UaqrwPH9TsIMzMzS9p5Ud4RwHLAu4BdmuQTcAbwX8MJSNJk4CRgH+CgiJjRJO8KwDHA3sAqwFzgTOALEfFMSf49SZ1yy4Ykm5mZWR+0fFFeRDwXEQcBbwYuJ42UUd3yHKmPyJ4RcUhEPN9pMJL2Bm4Gdmsj7wrAlaRGyzuAScDH8vJzSUsW8i8HfA74UKfxmZmZWfe1/dbeiPhFREwl/dHfgjQ76xbApIjYOSJ+OZxAJB1OeoRyMPDzNnb5HLAZ8J6IuCIinoiInwFHA3uQOtLW+wxwSkTcM5w4zczMrLvabozURMTjEfHniLgqrx/vUiw3AZu206iRtDxwKHAP8KtC8gzS+3Q+VJf/VaROtad0KVYzMzPrknb6jLxA0nhge2AjYAVgHvA34IqyPhpVRMQVFbLvDCwLXBsRUSjnQUmzgZdL2igiZgNvBMYDF0si7wtwgqR5wHsj4m/Did/MzMw601ZjJDdCjgEOI3VmLXpE0leBz0fE012Mr5HN83pOg/Q5wMtzvtkR8RnSYxoAJA0BtwPTI2Jmj2I0MzOzNrRsjEhaFfgtsGnedB/wT2ABMAFYizSS5ShgD0lviogHexPuC1bL64cbpM/L61WHcxBJ7wHeA7DOOusMpygzMzNroGmfEaVnGj8H1iV1DF0vIlaLiCkR8bq8Xg1Yj3TnZFPgnB7HDOmRC6RhumVqd2cmFBMknQCcnf97gqQfNTpIRJyWz3HKyiuv3HGwZmZm1lirOyP7AxsC20bEzY0yRcQdwLGSfgpcKmnviPhJF+MseiKvl26Q
Pi6vFxQTImJ6TyIyMzOzjrQaTbMfcHSzhki9iPgz6Q7JgcMNrIV/5fWkBukT8/reHsdhZmZmw9SqMbIVcG7FMn8GbN1ZOG27Ka/Xa5A+VMhnZmZmI1SrxshLSXN5VHFP3q+XLgaeArbJ/VpeIGkl0tDj2/KwXjMzMxvBWjVGHgFWqljmSnm/nomIR4FvA6uTZlutN400Tf0JvYzBzMzMuqNVY+SPwOsrlrl73q/XPgH8BThN0g6SxkvaizTq50I826qZmdmo0Kox8iPSKJlV2ilM0mqkDqw/rBqIpCFJISlIbwgGOCNvm1PMHxHzge2AHwM/IM0tcnxe9oyIZ6vGYGZmZotfq8bId4BHgesk7StpXFkmSeMk7QdcA8wHzqgaSETMiQg1WIYa7DM/IqZHxNoRsUxEbBgRn1lMs8CamZlZFzSdZyQinpX0ZlKH0bOABZJuIs3A+gRp8rG1SNOuTwDmAm+OiOd6GrWZmZmNGS2ng4+If0iaApwIvA14bUm250mzmh4REQ90N0QzMzMby9p6UV5uYLxD0v+SOrRuBKxIeiQzG/hNRMztWZRmZmY2ZrXVGKnJDY7TexSLmZmZDaBKjZFmJJ0IbJb/GxGxS7fKNjMzs7Gra40R4A/Ag6ROrR/tYrlmZmY2hnWtMRIRZwBImogbI2ZmZtamVvOMdCJ6UKaZmZmNUb1ojJiZmZm1zY0RMzMz66umjRFJFy+uQMzMzGwwtbozsuNiicLMzMwGVqvRNJL0j4pl+tGPmZmZta2dob1DHZTrETVmZmbWllaNkQDWA1ShzBVJE6CZmZmZtdTyMU3VF+BJWpFqjRczMzMbYE0bIxFRuf9HRMzH/UbMzMysTW40mJmZWV81bIxI2k7S9sMpXNKSkt4radnhlGNmZmZjV7M7I/cB50vao5OCJY0DfgTsGhFPdlKGmZmZjX0NGyMRcSvwPuA8ST+TtIOklo91JE2W9H7gVmAD4JCuRWtmZmZjTqsOrD+QNB84HbgUeFLSbOBuYB7wNLAksCywMmkY8Nqk0TTnAgdHxCO9C9/MzMxGu5aTnkXEBZJeDrwbeCfwSmDLBtnnkx7NnBQRl3UtSjMzMxuz2pmBlYh4DPgq8NU8j8jGwBrABOBZUiPkH8CtEfF8j2I1MzOzMaitxki9PI/ItT2IxczMzAaQ5xkxMzOzvnJjxMzMzPrKjREzMzPrKzdGzMzMrK/cGDEzM7O+cmPEzMzM+qpSY0TSkb0KxMzMzAZT1TsjR0lavyeRmJmZ2UCq2hgR8EdJv5W0j6TKk6aZmZmZ1eukz8hawHnAp4G7JB0vaYPuhmVmZmaDompj5KCImB8RJ0bEFsBepLf13ijpYklvl7R098M0MzOzsapSYyQivlv4/1URcRCwJvBL4DukuyVflLRR98I0MzOzsWrYQ3vzI5pPAh8DxgOTgX8HZkm6RNLuwz2GmZmZjV1Vh/Y+l9dLSXqbpN8BfwM+AowDTgFeFRFbAqsD3wdOlvSJ7oZtZmZmY0XV0TCS9AXgXaS+IgKuA04Dzo6IBbWMEfE4cLqkXwOzgOO6E7KZmZmNJZ0Mzf0oMB84GTgtIv7UIv8KeTEzMzNbRCeNkYOAcyLiiVYZJW0PXEJ6lGNmZma2iKqNkUuLI2pauBqYBDxb8ThmZmY2IKoO7d2pUZqkyZKWKOR/PiIej4inOg3QzMzMxraqo2m2k/RQXk4oJO9PmmNkn+6FZ2ZmZmNd1cc078r7HAOcU0g7B1gJOFPSvIi4qAvxmZmZ2RhXtTHyWuDdEfHDYkJE3AMcKWku8AnAjREzMzNrqeoMrGsDv2qR50fA5p2FY2ZmZoOmamNk6Tb2EbBMZ+GYmZnZoKnaGLkJOKxFnsOBVhOhmZmZmQHV+4ycDHxX0ibA2aTJzB4HlgM2BvYF3gEc2M0gzczMbOyq1BiJiO9Jeg3wPtJQ3iIB34iIs7oRnJmZmY19VR/TEBEfAN4M/AZ4EHgur38F7BkRH+xqhF0k
6U2SLpD0O0nXSPqVpC36HZeZmdkg6+TdNETEL4BfdDmWxWEG8MGI+D6ApM8Dv5O0WUTc29fIzMzMBlTlOyPtkHRkL8rtgstqDZHsy8Bk4N/7FI+ZmdnA60ljBDhqODvn99ycIykkTWuRdwVJX5U0V9KTkmZL+pSkpYt5I+IthU21Nw97KLKZmVmfVH5MI+ktwDuBDYEJpE6rXSNpb+AkYFwbeVcAriS9GXhf4AZgd+BMYDtJe0bEc02K2BZ4EjhvuHGbmZlZZyo1RiQdAXw1//ch4DEguhWMpMOBTwMHA/uQ3oXTzOeAzYA3RsQVedvPJB0NfAl4L6lhU3Ys5WN9KiLuG370PTBlSus8s2b1Pg4zM7MeqvqYZjrwQ2C1iJgcEUMRsV5xofO7JTcBm0bEL1tllLQ8cChwD4tOUT+D1Ej6UJMijgPuiIgvdxaqmZmZdUPVxsjqwPQ27iQc00kwEXFFRDzcZvadgWWBayPiRXdnIuJBYDawgaSNijtKmg5sAhzUSZxmZmbWPVUbI7fRxl2PiOioMVJR7WV8cxqk17a/6KV9kg4F3gC8LSKelbS+pF17EqGZmZm1VLUx8nngY60ySWrWabRbVsvrRndS5uX1qrUNkvYFPknqa7K5pCnAbsAOvQrSzMzMmqs6mkbAppL+QBqBcjcLh8cubuPz+pkG6U/n9YS6bd8jnfPMQt7SOzmS3gO8B2CdddbpKEgzMzNrrmpjZEbdv7fM6+JoGpVs64VaI2iR+USy2tDgBbUNEdEob6mIOA04DWDKlCmL45zMzMwGTifTwbfq9Cng2x2UW9W/8npSg/SJee1p3s3MzEawyo2RiPhuqzySvtNZOJXclNfrNUgfKuQzMzOzEaiTob0tRUSvppmvdzHwFLBNnsDsBZJWAjYCbouI2YshFjMzM+tQpUZD2Ztt85Tsi11EPEp6HLQ6sEcheRrpcdEJizksMzMzq6ijOxiSDpF0uaTHSdPCI2lHSd+StGZXI2zuE8BfgNMk7SBpvKS9gKOBC4FTFmMsZmZm1oFKjRFJ4yT9mjTCZHvS8NraI5K7gSnAFZLaepxTUv5QflNvsPC9NGfkbXOK+SNiPrAd8GPgB6S5RY7Py54R8WwncZiZmdniU/XOyIeBfwM+C2wKrFhLiIi/A68ivTm35cRoZSJiTkSowTLUYJ/5ETE9ItaOiGUiYsOI+ExEPF2W38zMzEaWqo2R/YGDI+KoiPhr7rfxgoh4HjiSRftwmJmZmZWq2hhZHzi3RZ7bgbU6C8fMzMwGTdXGyBPAKi3ybEjdrKdmZmZmzVRtjFwFnNRoOK+kJYDjgMuHG5iZmZkNhqozsB4HXAbMlfQj4I8Akg4izYS6P+kRzXbdDNLMzMzGrkqNkYi4WtIBwLeAQ0gvxFP+v4DHgHdExA3dDtTMzMzGpk7eTXO2pEtI84BsTRreOw+4FjgzIu7vbohmZmY2lnXy1t7atPDHdzkW68SUKc3TZ81aPHGYmZl1qCcvtJP0370o18zMzMaeXr1d94s9KtfMzMzGmEqPaSRd3KtAzMzMbDBV7TMytc18UbFcMzMzG1CdjKZZ5NGOpPHAmsBbSG/ufVcxj/VJqw6u4E6uZmbWV1X7jHyzbGNEPBERt0bE8cAPcJ8RMzMza1OlxkhE/Fcb2S4C9uksHDMzMxs0vRhN8wpg+R6Ua2ZmZmNQ1dE0r2uSvBzwcuAIwJ0QzMzMrC1VO7DOpPlIGQH3A9M7DcjMzMwGSyfTwR9bsi2AJ4C/AxdGxIJhRWVmZmYDo5Ohvcf0IhDrI7/fxszM+qhqB9b1ehKFmZmZDayqjZEzOjmIp5E3MzOzRqo2Rnbs8Did7mdmZmZjXNU+I5L0XE8iMTMzs4FUtTFyGanfyNqkIbx3Ao+SJjlbG1gZ+EtOqxHwb8OO1MzMzMakqo2RjwLfAg6JiN8WEyXtBhyV0/9Rt/35YUVpZmZmY1bVxshngcMi4uqyxIi4SNIC4MvAXnVJHg5s
ZmZmpao2Rrah9VTv1wEvmjbec5OMcq3mIQHPRWJmZh2rOppmaWDjFnleASzTWThmZmY2aKo2Rq4HZkh6WVmipA2AGcA1w4zLzMzMBkTVxzSfBC4GbpH0B+BvwOOkN/ZuDGwFPA1M7WKMNhp4SnkzM+tQpcZIRFwlaQ/gdGBKXur9HTg0Iq7rUnxmZmY2xnXyorxLJG0IbE+6E7ICMB/4fURc1eX4zMzMbIyr3BgBiIgArsiLmZmZWcc6aowA5E6sU4BVI+JESZOApyJiQdeis7HDw4PNzKyBqqNpkLSppMuB2cD3ga/mpG2AuyQd0MX4zMzMbIyr1BjJQ3cvB7YjTX52bl3y5aQZWk+VtFPXIjQzM7MxrepjmqNJd0T2j4jbAGpv8c2PZ74s6SnSO2wu6WKcZmZmNkZVfUyzM3BwrSHSwNnAlp2HZGZmZoOkamPkpaS5RJp5NuczMzMza6lqY+RuYMcWeXYC7uwsHDMzMxs0VfuMnAucJel/gLMi4tn6REk7A98AzuxSfDZIujGlvKelNzMbdareGfkM8AjwHeARSbMAJF0maS5wUU7/f12N0szMzMasSo2RiHgY2IF0h2QZ4FWA8rY1gB8Dr4uIR7ocp5mZmY1Rnbyb5l5gb0mrkmZgXRGYB1wfEfd3OT4zMzMb4yo1RiR9J//z2og4Ffhl90MyMzOzQVK1z8g0YGtSvxAzMzOzYav6mOYZ4PURcXcvgjEzM7PBU/XOyG1AtMok6cDOwjEzM7NBU7UxcjJwRBv5zuggFjMzMxtAVR/T3AS8SdKFwDmkmVaf6HpUZmZmNjCqNkYuJj2mEbBL98MxMzOzQVN5nhHg2BbpAj7dQblmvddquvh2LK4p5RdHrKOpPhYH18fINFZe89DO9bU4zmWkxFGnk0nPjmmVR9KRnYVjZmZmg6ZqB9Zt28y3XtVAzMzMbDA1vTMiaZ3avyNibkRc206hEXHHcAMzMzOzwdDqMc2cvA5Jy0fEgh7HY2ZmZgOmnT4j6wFExIL6OyU1ETG361GZmZnZwGjVGInCI5c5LJyBVfnfS/YgLjMzMxsQimg8u7uk5yJiybr/r0tqhPyDhXdMBqJ/iKT7gW6f62TggS6XOQhcb9W5zjrjeuuM660zY73e1o2IlcsSKjVGWm23aiTNioguTGwwWFxv1bnOOuN664zrrTODXG9Vh/a2RdLrelGumZmZjT09aYwAl/SoXDMzMxtjetUYUY/KHWtO63cAo5TrrTrXWWdcb51xvXVmYOutVZ+R51k4eqYS9ykxMzOzdrQzz0gndzk6asCYmZnZ4Gn1mCYiYomqy2KJvIskrSDpq5LmSnpS0mxJn5K0dL9jq5G0v6TrJS2QdL+ksyW9rEn+cZKOkvT3fE53SPqSpJd0IRZJ2jPHMFfS05LmSbpM0gHDLb+bqtSbpO0kfUHSdZLuk/S4pL9JOlXSBj2Kb09JIWlENeBHYr1J2lXSeZLulfSUpDsl/VLSft06xnCM4Dq7IP+cPiHpVkn/J2njbh1juCTtLulSSY9KeljS+ZJe3SDvFpKOlnSFpLvzOd0m6fuStupyXJMlnZN/Pqd1s+xuqPo3obDvVpKeyec21NtI2xQRDRfg9mbp3d6vHwuwAnAT8E9gB2A8sBfwKHABsOQIiPFY0t2mjwPLAxsBVwDzgE1K8i8N/BaYD+yZz2lH4B7g98Byw4znUzmei4CtgAnAK4Cf5+3f6XeddVhvjwH/Av4TmJiXfYCHctpWPbj27swxRr/rayTXG3B0Pv6hwEuB5fLxHgF+7TorjenDOaaLgc3yz+k2wB+Bp4CpI6DeDs4xnpg/17WAnzaKD/gz8DjwLtKcHMsDrydNyPkMsEeX4tobuBd4OMc3rd91NZzrrbDvkqS/A5GXoX6fT0Q0b4wMwgJ8PX8gbyhsr/0gv68Hx5wJzGkz7xTgeeCcwvY1gSeBa0v2KY09/4AFcPww4/9s/kX6ksL2ccBt+Rg7j8J6ewzYr2T7
h/I5fbfL53MScDU9boyM9noj/cEO4D9K0j4MnO46W6SccaSG2vPAKoW0rfMxFomrC/U2o91rmdTweAK4ltx/MW9fLv9+mQssU9jnz8DHS8raK5/TpV04h8OBu4E31s6HHjdGen29FfL9D3B7ruMR0xgZdY9UuknS8qRvWvcAvyokzyB9UB9azGEVTSf12/lO/caIuAv4DbCNpO1r2yUp7/MM8L1CWT8nffM6XNKyw4jpLtIvy8cKMT1NulsCsOswyu+GSvWW7UGqo6K/5/XEbgWXj30w8O5uldklI7HejgNuiYjzigkR8eWI6HcdjsQ6m0T6xvxARNxXSLs5r7cY5jGG6zBgWeCMyH8lASLiceCHwNrAWwv7HAScWlJWN39GbwI2jYhfdqGsXujkegMgP8Y5GngvqeEyYgx0YwTYmfTDcG39DwNARDwIzAY2kLRRbbukCbkvxi25L8YDks5t9IyzC96Q19eUpF2d12+s27YF6RvHzRHxaH3miHgWuB54CdDxxHQRcXJEfKxBcu2YL+r4PArqjYi4PMrfTP3avP5dNwKTNA44HfhiRPy5Rd6BrjdJryQ9Aryswj4DXWe5/HtJ3+4nS1qlkLxpXv+rfqOkpSQdIenG3BdjnqSLJPXqi0WtTq4uSWtUb9dHxEMl+bv2MxoRV0TEw+3mHw3XW51TgZ9ExIVdj2qYBr0xsnlez2mQXtu+OaSLjjSh28dIrcuJpGewSwFXStqlm8FJWov0DWdeRMxrFV/h33MoV7ZPN9Uabi/88Rgl9VYsY5yk9SR9jHRb8zTgm10K8VOkn73PNsvkegMW/pGZK+lASTfU/aH8jaQdC8d3nS00jdSH4GxJm0kaL2kb4Fs5/Rt1MSwB/AT4CvBtYGVgE9KjkgslHdiFeF4gaclcPpT/rqpta1ZvS0paS9JhwJdJd5qO7GKYLY2m603SwcCW9P9uf7l+Pyfq50L6YQzgyAbpZ1PX94J0wQdwbCHfiqQ7AnfQRodX2nw+SHo2GMA/GqTvntOvq9v2EZp0IgU+Txf6jTQo+6WkW3+/58XPgEd8vRXSN2Zh566HgfcDS3epjjbNdfRvddtK+4y43gLgi7m82/P57kLqU7A5cB3wLPA211nD+F5OulsQdcstwOGFfP+V084sbF8y1/0jwEvbON6Msmu5JN/kfLznW9TLfQ3Sl607nyeAo4AJ3aq34vnQoM/IaLnegFVJj+jfWbdtDu4zMmKMz+tnGqQ/ndcTJC3Fwuf736rPFBHzSX1O1iE9+lns8Q1zn245nnRxHxj5ah9F9VYf1y2kOxfrAB8l9Vy/ZrhD4PK3z9OBGRFxeYu8rrdkhbweIl1Xv4uIxyPiJmA/0uPAUyS9xHX2YpLeCswidXbcktSHZAfSl4UV892JmsPzulhvzwE/yvvuPdyY6gy33p6MCAGrAwcChwA35cd6i8Uou95OJDVQ/q+LsXTVoDdGnsjrRvOJjMvrBaSW+vLAwxExtyTvnXn9whsXJU3N47hftJCG2a5bliZpaofxDWefYZO0P+m28P7x4n4Qo6XeXiSSOyPiW/m8XgWc2Sh/m94PrEu6pduK6+3F7o2ISwvHuo303HwSsBuusxdIWo/Ugf1R4M0R8aeIeCwirgQ+ARwDnJPzLk/qlwNp2G9RWb0NNai3d+X0snqbVldmV35PRcS/IuJHwH+QGqw/lrRMs326aFRcb5L2JPUhOazyGS5G7czAOpbVOnBNapBe65l9L+m2G8AkNZ+gatW6fy8A/laSZx3SxXRbSVr9D1+V+Iazz7BI2o30zeA9EfHTQvJoqbeGIuIXku4D/k3SJhHxl3b2qydpbdKokAPzt6ZWXG9JrSNh2S97SLfBtwM2BGqjRga9zgDeTnqU8YsodJSNiDmSrgHeImk7Uh3WzJMaTrpdX2/PUF5vq5PuZpWl1V/3D5O+xY+TtGLJz0TVertR0ixSf41dSHNE9dqI/xnNDc2TgE9HxJwmMfbdoDdGbsrr
9RqkD9Xlq7VE74qItdopPCKuI7WeX0TSTNJzuqazIEbEPyXNAyZKmhiLdliqj4/Cv9s5p2FT6mn/M+D9EfGdkiy1mEd6vbVyB7AKsAHQyR+IXUijmH7a6Jd93S+0S0l3UcD19te8bjUbcuBrrey49zRIr21/JQvvhgSwbKQh+k1FGkZaVm8zgHe1UW/PSfpLPv56wI0N4q9ab9uQ6m1xGA3X26tJoyu/IukrDYq9Pf9OuiMihhrk6blBf0xzMWmmv21U+AshaSXSyJDbImI2qdPXfGD1stuAkpaQ9Hql3s7dVGvhv7YkbdtCHoA/keYB2SS3iutjXIo04dFjVBgq2YhST/FzgSPqGyKSNpX09vzfUVFvkg6QdEOT8lbP60c6CSYiZkSEypa6PLVtU3G91dQ6X66r1OemaN28vgXXWb0HC2UVrZHXz0Sa1+NmUv+bdcoyS9pJ0obDiKdM1Xp7naT7m5TXjXqrYsRfbxExs8nvndodsfXytqEux1nJQDdGIs3D8W3SRbxHIXka6YfzhJz3OdKQuyWAsvev7EW6ALp9t+mEvD64fqOkNUjTIM+KiCtq23PH0a+RvkkW43wzacTLqRExrAlvJO1MGko3PSK+XUjemtwhbrTUG2nUwOb5WTuFfaaSvl08SPnY/q5zvSUR8U9Sg3cSC+dXqJW/PukX8t3ARa6zFzk/r98kaXx9gqR1gdeQOrZekjeflNfTSmKaQvri1qhh06lTSCPLDqr/MihpOeBtpFd0/Lgu/xKkeVMWmdBL0stJd0XqJ17sqVF0vY0OMQKG9PRzIT33u5nyd9P8BliqLu944CpSy/tg0rPAScA7SLfsjmrzmDNpc+rfnP9zpG+H/0vqMLUh6T0E84HNSvIvTfolU/ZumhspTOPeQZ3tRHqOeQ9p+HNxuRaYOZrqjdTxLkjTTe+e45sM7Et6TvsMsHePrsFGQ3tdb+kYa5L6jNxBmqxvHOldK9fm63AX11lpTCfl4/yaNBR6OdI35z/k7Z+ty1ubZ+Qp0vQAa+XzeFOu+zPaPOaMsmu5Sf5351hq76ZZk/RumqfrP9ecd8ecdy6psbIy6ff3G4Bbc9r0bv1c1p8PjYf2jvjrrUkZcxhBQ3v7HsBIWPIFfQKp954c5ikAAAvYSURBVPNTpKmFPw2MK8m7LPBJUgPmSeB+0jP+fSocr9KFl/c5ALiB1Jv6AVJP+A2a5F+G1GP+tnxOc0lj4pfvQn3VfkCbLTNHU72RGnBvAs4izavwVF7mkEY2vLLL19y0JnU31fW2yHFWAU7OP6NPk/5of580bfeo+hldnNcasH8+l3mkOVkeIH3J2qsk75LA+/I5LCB1Mr2O1GBYos3jzaBCYyTvswdwOenx8TzS3YQpJfkETCUNj5+dY3yahXdQduxSnQ01+dlc5JoY6ddbYZ+pTc5tWreuu04W5QDNzMzM+mKg+4yYmZlZ/7kxYmZmZn3lxoiZmZn1lRsjZmZm1ldujJiZmVlfuTFiZmZmfeXGiJmZmfWVGyM2akma0+CV249LukPSTyUd3OyV4pIuyOWsXJK2gqQTJN0u6UlJ9+QyJ+b010u6QtLDkh6TdKOkA3t5zmORpI9Lmidppx4eo9G10mgZ6lUsg2xxfNY2OrkxYqNWRAxFyYvmSC8BeytpiuivA38ve59Fth5pds/lStK+QXp77ntIr+behzRz5kRJrwDOI818uBFpGuvZwM5dOLVBsw5pFuRVenWARtdKcaHx266tO3r+Wdvo5BlYbdSTlF7wUvfHpi5tC+C3wAqkd11cWUgfD4yPiIdK9n0QuD0iptRtexlpav3DSO/T2DMizs9pKwITIqLRa9utRH4b72oRcfdiOFbDayWnD5GmaF8vIub0Op5Bszg/axtdfGfExrSI+BPp3RrLAN+XNK6Q/kRZQyR7Kel9GfX5b4uIZ4CV8qbH6tLmuyFSXUQ8P4L+ON1BevHYHa0yWnUj7LO2EcSNERvzIuLnwC2kW8T7
QXpVe6GPwNRafkkza9+ggR3r8syo7QccldMvKetnIGltSd+SdJekpyTNlXSypNXqY5N0S6H8KZIukvRIbXsh/4GSrsl9VB6VdKWktxfy7Fs4tw0lHZdjeVLSHyTtWlZXkiZI+rSkv+a89+bjHSdp/ZL8LeNpphDn0XXb/7eQtrykUyTdL+kJSZdLelW7x2kjjqFc1+tGxGMREZJeW4hhJ0kfzHXzdO0zqyvjpZK+otTH6ClJ/5J0lqQNGxxzO0mXKPVxmp8/921q119eZtZia3CtfqM+b4PjbCvpfEkP5c/0ZkmfUF1fKkmrFY4xLX+2f8nnMkfS9Cb1d4Ckq/N1ME/SnySdprrHo40+67r0CZKOyj8TT0p6QNK5kl5dkvelkr4gabakBUo/Xxco9RGb0ChOG8H6+ZY+L166sZDfOtkiz9dzvh8Uth9N4U25hXJnNiiv2X6vAO4j9Vl5DemuzLakt0HPBdYo5B/KZV0FXAa8GngJ8IX68yL1YQnS68Mnke7OfDZv+1RJHDNy2vnAoaR+L5uSXl2/AFi7kH8CcC3pzal7k16PvjrwmVzOuYX8leJp8tlMzfscXZI2M6f9GNiL9LhtW+Au0pt7l+vGtVL3GQw1+awvBI4n9Q9ai/SW1hk5z2r5870feH3+zDfL9fkwsHmhzJ1Ib+q9EXhlzr898Kd8jZTFWPlaBfYlva33gnyO40kN8gX5fJYo5J+Wy7oI+Bqp/9XqwNl5+1tLjlG7Dj5Jups4MR/jUWBeO5913bW3IMe8LLA+6dp9kvSItZZXwPWkt/XukM9pLeCURvXjZeQvfQ/Ai5fhLo3+wBTyTM/5ri9sr/wLvo39ZuW0nQvbd87bzypsr/0hfJb0zby2fS3g7PzvPXOey0qOd1ned+PC9hl5nxML29+Wt08vbP9y3v7ukmOcT11jpJN4mnw2pX+gctrMnPbfhe3/k7f/ZyfXSpNlqMln/dvC9v2Bw/O/f5LzHFzIswHwPHBl3bYlSA3V54CNCvl3a3Q9V71WSQ2kx4BHgImFtGPLPmsWNkb+WNi+KuUN0tp1cFZJTB+h/cZI7do7trB9RVKj5g5gybxti5z3S4W8yvkWqR8vI3/xYxobFLW+HSv28iCStiHd2bg9Ii6uT8v/vx94q6SXlOx+XUTcUZf/nxGxb/7v4Xn9rZL9zgaWBA5oENbPC///a16/8PhA0lKkvjUB/LCkjK+Rvi3XDCeeTrQ8hyqis1E05xTKOCsiao/e9iLd6fi/Qp5bSY3T7eoe12wPvAz4Q0TMLuS/iHSNdMOBpFFiP46IeYW0s/N6WoN9zyvEdS/wEIvWd+06+EFJGecAZ7YKsu7ag8L1FBHzgV+RHrHWRqpFXu8qaaW6vAFsR7rDYqOMGyM2KJbP6/k9Ps42eX1jg/Q7gXHA5g3SOim3tt+UkjSAYofBWsOs/tn6y0l1dE9EPFIsICIuiohvdimeTrRzDr3W6POZQvpW/peIeLrJfrX62Cqv/1bxOFW18xm9SmmES1FZJ9PHWLS+t87rRc4lIuZGxAfbiHNj0rX3cETMbRJrrf7+TBohtyVwu6RvS9pD0tIRcVdEPNHGMW2EWarfAZgtJhvl9eymuYavdudlLxU6nxasWrKt2S/RWrl/lEpHpTYqs6zcWlz1BU3M68ebxNCteCor+QNTdg7DKX9OG2U1+nxqdbFVm5/5CnndqK4fbRFHu2pxfU3S1xrkWTbHU7xzUnauwaJ1VPW6KVOLc1I79RcRIekNpDmADgcOzsu9kr4IfCXfJbFRxI0RG/OU/lrWRo/8pseHq/1SPysi3tnlcicDG+Zb/91Wi7ts8rd+xDOa1OruyojYoY38tbtzjep6+QbbS//ANhk9Uovr3RFR9jitG2rXQbvXTaMyAO6KiLXa2SHS8PoTgBMkbQW8E3gv8CVSh9bPDiMe6wM/prFBsA+pI+FcFj4r75Xa8+qhskRJkyXt3sHww1blvlZpgrdO3ULq6Li60uRt
xfK3kfS+xRjPaHI9qZPqUFmipOXyZ17r3/D7vN64QXlrN9heu1tR/MO/ZoP8rT6jjeqHCXfourxe5FwkrSFpuqRWs63eQmqgra6SVzdIWkLp1Qtr5f9PlvS6WnpE/CEiPgzsnje9tZMTsf5yY8TGNKW5KE4mdS7cr8Ez/a6JiFmkX9DbStqoJMtRpKGQT1Ys+qS8nlZMkLQmadTJKyuW+YKIeA44nXQb/m0lWb7EwrtLPY9nNImI+0ijadZU+fwth5E6BT+V/38VaRjwVsVrRNJuwCLvScr+ntcvL2zfq0H+M0n9PPbPnUTrj7MkqYPpcN+ldHJe71uSdhjwcVLH14bytXca6e9RWafnvUhDk2vnsBlwdj6HejfntfuMjEJujNiYo/SCu60lfQm4gvQ8e9eIuGoxhXAgaUTE+ZJ2VZqwa4080dO7gfdFxPNVCoyIC0i3pd+RJ3taP08StSNptMGllI9oqOJIUkPqi5L2ljRe0lqSvk7qcPuJxRzPaPIBUn+k70l6i6RJklaW9AHSPCxHRMRj8MKoj0OBp4FzJG0paVyeIOxo4C8NjnEh6bo6It+pWl7SW1jYifRFciPpINIQ8fPycSZI2oTUEJnEwsn7OhLpVQjfIF0Hn8znPVHSocDH8nk/20ZRRwFXA1/JE5etmst6B/Bt0pDfOXX5VwdOl/QyScsoTcj3jZzWqH+MjWT9HlvsxUunCzCH8rkiFpAeyZwLHAIsU7Lv1LJ9c9rMBuUONdqPPPlVXflrkL41ziV9I76TNHHXlEK+smPNbHLO+5EaWLX5I/5Ims9hfItzm9nkeFPr9p1AapT8lXT35m7gR8ArOo2nxWdYVpdTWTjfxSJ13OBzH+rwWpnaZJ+hBvtMa5B/ImmiulvzZ34P8GtSQ7gs/2uBi0nX68Ok4csb1D6jBvtMyfX9JKlhchLpsU19fB8p7LM16WfhQdJdg7+T3qu0ZqvPIm8/uiTt6MK+B5AaE4+T7oRcCuzezmddl74saeK0m+vO71Jgn0I54/P1cUH+XGvX6a+omxzNy+ha/KI8M7MRRGla9x2jwcv8zMYiP6YxMzOzvnJjxMzMzPrKjREzsxFA0tF50q8d8/8bvonXbKxxnxEzMzPrK98ZMTMzs75yY8TMzMz6yo0RMzMz6ys3RszMzKyv3BgxMzOzvnJjxMzMzPrq/wPAinxlvlWnKwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 576x252 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# generate the token-swap mapping\n",
    "vocab_match_no_frequency_iv = generate_vocab_match_no_frequency_iv(\n",
    "    task_token_by_length,\n",
    "    task_token_frequency_map,\n",
    ")\n",
    "# plot the frequency distribution afterwards (the diff)\n",
    "plot_dist(\n",
    "    vocab_match_no_frequency_iv,\n",
    "    task_token_frequency_map,\n",
    "    task_token_frequency_map,\n",
    "    facecolor='r',\n",
    "    post_fix=\"mismatched\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to the disk if things look good!\n",
    "# one shared callable: the same corruption is applied to every split\n",
    "_corrupt_fn = partial(\n",
    "    random_corrupt,\n",
    "    task_name,\n",
    "    modified_basic_tokenizer,\n",
    "    vocab_match_no_frequency_iv,\n",
    ")\n",
    "corrupted_train_dataset = train_df.map(_corrupt_fn)\n",
    "corrupted_validation_dataset = eval_df.map(_corrupt_fn)\n",
    "corrupted_test_dataset = test_df.map(_corrupt_fn)\n",
    "\n",
    "corrupted_datasets = DatasetDict(\n",
    "    {\n",
    "        \"train\": corrupted_train_dataset,\n",
    "        \"validation\": corrupted_validation_dataset,\n",
    "        \"test\": corrupted_test_dataset,\n",
    "    }\n",
    ")\n",
    "corrupted_datasets.save_to_disk(f\"../data-files/{FILENAME_CONFIG[task_name]}-corrupted-mismatched\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "original tokens: ['We', 'walked', 'in', 'to', 'pick', 'our', 'little', 'man', 'at', '10', 'minutes', 'to', 'closing', 'and', 'heard', 'laughter', 'from', 'kids', 'and', 'the', 'staff', '.']\n",
      "scrambled tokens: ['northward', 'Darfur', 'Bert', 'stink', 'Minimum', 'descriptive', 'Ã³l', 'gunning', 'Turns', 'discomfort', 'TERRIBLE', 'stink', 'Washington', 'passcode', \"Ham's\", 'blurred', 'human', '15', 'passcode', 'agree', 'faction', 'Goldman']\n",
      "reordered scrambled tokens: ['northward', 'gunning', 'blurred', 'Turns', 'faction', 'stink', 'TERRIBLE', 'discomfort', 'passcode', 'human', 'stink', 'Darfur', 'passcode', 'Bert', '15', 'Goldman', 'Minimum', 'Ã³l', \"Ham's\", 'descriptive', 'Washington', 'agree']\n"
     ]
    }
   ],
   "source": [
    "# pick out an example sentence to show the scrambling.\n",
    "# NOTE: reuses the `example` drawn in an earlier cell; run that cell first.\n",
    "for name in TASK_CONFIG[task_name]:\n",
    "    if name is None:\n",
    "        continue  # skip placeholder (None) entries in the task's field list\n",
    "    example_sentence = example[name]\n",
    "    print(f\"original {name}: {example_sentence}\")\n",
    "    if task_name in (\"conll2003\", \"en_ewt\"):\n",
    "        # for these tasks the field is iterated token by token, so map each token directly\n",
    "        corrupted = [vocab_match_no_frequency_iv[t] for t in example_sentence]\n",
    "        print(f\"scrambled {name}: {corrupted}\")\n",
    "        random.shuffle(corrupted)  # shuffles the token list in place\n",
    "        print(f\"reordered scrambled {name}: {corrupted}\")\n",
    "    else:\n",
    "        corrupted = corrupt_translator(example_sentence, modified_basic_tokenizer, vocab_match_no_frequency_iv)\n",
    "        print(f\"scrambled {name}: {corrupted}\")\n",
    "        # split on single spaces, shuffle the pieces, then re-join\n",
    "        unordered = corrupted.split(\" \")\n",
    "        random.shuffle(unordered)\n",
    "        unordered = \" \".join(unordered)\n",
    "        print(f\"reordered scrambled {name}: {unordered}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Step 3. Not Even Words, Go Symbolic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate the token-swap mapping\n",
    "vocab_match_abstract = generate_vocab_match_abstract(\n",
    "    task_token_frequency_map,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to the disk if things look good!\n",
    "# one shared callable: the same corruption is applied to every split\n",
    "_corrupt_fn = partial(\n",
    "    random_corrupt,\n",
    "    task_name,\n",
    "    modified_basic_tokenizer,\n",
    "    vocab_match_abstract,\n",
    ")\n",
    "corrupted_train_dataset = train_df.map(_corrupt_fn)\n",
    "corrupted_validation_dataset = eval_df.map(_corrupt_fn)\n",
    "corrupted_test_dataset = test_df.map(_corrupt_fn)\n",
    "\n",
    "corrupted_datasets = DatasetDict(\n",
    "    {\n",
    "        \"train\": corrupted_train_dataset,\n",
    "        \"validation\": corrupted_validation_dataset,\n",
    "        \"test\": corrupted_test_dataset,\n",
    "    }\n",
    ")\n",
    "corrupted_datasets.save_to_disk(f\"../data-files/{FILENAME_CONFIG[task_name]}-corrupted-abstract\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# try the abstract mapping on a fixed sentence\n",
    "corrupt_translator(\n",
    "    \"this movie is great\",\n",
    "    modified_basic_tokenizer,\n",
    "    vocab_match_abstract,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pick out an example sentence to show the scrambling.\n",
    "# NOTE: reuses the `example` drawn in an earlier cell; run that cell first.\n",
    "for name in TASK_CONFIG[task_name]:\n",
    "    if name is None:\n",
    "        continue  # skip placeholder (None) entries in the task's field list\n",
    "    example_sentence = example[name]\n",
    "    print(f\"original {name}: {example_sentence}\")\n",
    "    if task_name in (\"conll2003\", \"en_ewt\"):\n",
    "        # for these tasks the field is iterated token by token, so map each token directly\n",
    "        corrupted = [vocab_match_abstract[t] for t in example_sentence]\n",
    "    else:\n",
    "        corrupted = corrupt_translator(example_sentence, modified_basic_tokenizer, vocab_match_abstract)\n",
    "    print(f\"scrambled {name}: {corrupted}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### *Step 4: Optionally, mismatch with out-of-vocab English tokens, i.e. tokens whose frequency is 0 in this dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate the out-of-vocab token-swap mapping\n",
    "vocab_match_oov = generate_vocab_match_no_frequency_oov(\n",
    "    wiki_token_frequency_map, task_token_frequency_map, match_high=True, match_similar=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to the disk if things look good!\n",
    "# one shared callable: the same corruption is applied to every split\n",
    "_corrupt_fn = partial(\n",
    "    random_corrupt,\n",
    "    task_name,\n",
    "    modified_basic_tokenizer,\n",
    "    vocab_match_oov,\n",
    ")\n",
    "corrupted_train_dataset = train_df.map(_corrupt_fn)\n",
    "corrupted_validation_dataset = eval_df.map(_corrupt_fn)\n",
    "corrupted_test_dataset = test_df.map(_corrupt_fn)\n",
    "\n",
    "corrupted_datasets = DatasetDict(\n",
    "    {\n",
    "        \"train\": corrupted_train_dataset,\n",
    "        \"validation\": corrupted_validation_dataset,\n",
    "        \"test\": corrupted_test_dataset,\n",
    "    }\n",
    ")\n",
    "corrupted_datasets.save_to_disk(f\"../data-files/{FILENAME_CONFIG[task_name]}-corrupted-matched-oov\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Step 5: Prepare TSV files (train/dev/test) for LSTM model training as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset universal_dependencies (../tmp/universal_dependencies/en_ewt/2.7.0/1b298bc956ef19f298a0e2970bf4f9696fd258cb2a749b30af709abfa09f2983)\n"
     ]
    }
   ],
   "source": [
    "condition = None # None, matched, mismatched, abstract (which is not included in the paper)\n",
    "task_name = \"en_ewt\"\n",
    "\n",
    "# The uncorrupted data lives directly under the task's file name; a corrupted\n",
    "# variant uses the same name with a \"-corrupted-<condition>\" suffix.\n",
    "output_dir = f\"../data-files/{FILENAME_CONFIG[task_name]}\"\n",
    "if condition is not None:\n",
    "    output_dir = f\"../data-files/{FILENAME_CONFIG[task_name]}-corrupted-{condition}\"\n",
    "\n",
    "# Positional arguments for load_dataset, for the tasks whose uncorrupted data\n",
    "# is loaded through load_dataset instead of being read back from output_dir.\n",
    "hub_dataset_args = {\n",
    "    \"conll2003\": (\"conll2003\",),\n",
    "    \"en_ewt\": (\"universal_dependencies\", \"en_ewt\"),\n",
    "}\n",
    "if condition is None and task_name in hub_dataset_args:\n",
    "    datasets = load_dataset(*hub_dataset_args[task_name], cache_dir=cache_dir)\n",
    "else:\n",
    "    datasets = DatasetDict.load_from_disk(output_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "if \"train\" in datasets:\n",
    "    # Input columns to export: every non-None entry configured for this task.\n",
    "    text_fields = [field for field in TASK_CONFIG[task_name] if field is not None]\n",
    "    # Source column of the label: the tagging tasks keep it under a task-specific\n",
    "    # name, every other task under \"label\".\n",
    "    label_field = {\"conll2003\": \"ner_tags\", \"en_ewt\": \"upos\"}.get(task_name, \"label\")\n",
    "\n",
    "    train_examples = []\n",
    "    for example in datasets[\"train\"]:\n",
    "        example_dict = {field: example[field] for field in text_fields}\n",
    "        # label\n",
    "        example_dict[\"label\"] = example[label_field]\n",
    "        train_examples.append(example_dict)\n",
    "\n",
    "    # TSV header: the input columns followed by the unified \"label\" column.\n",
    "    columns = text_fields + [\"label\"]\n",
    "    write_tsv(train_examples, output_filename=os.path.join(output_dir, \"train.tsv\"), fieldnames=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "if \"validation\" in datasets:\n",
    "    # Derive the TSV header in this cell instead of reusing `columns` from the\n",
    "    # train cell: that name only exists when a \"train\" split was present and\n",
    "    # the train cell has been run, so relying on it can raise NameError here.\n",
    "    text_fields = [field for field in TASK_CONFIG[task_name] if field is not None]\n",
    "    columns = text_fields + [\"label\"]\n",
    "    # Source column of the label: the tagging tasks keep it under a task-specific\n",
    "    # name, every other task under \"label\".\n",
    "    label_field = {\"conll2003\": \"ner_tags\", \"en_ewt\": \"upos\"}.get(task_name, \"label\")\n",
    "\n",
    "    validation_examples = []\n",
    "    for example in datasets[\"validation\"]:\n",
    "        example_dict = {field: example[field] for field in text_fields}\n",
    "        # label\n",
    "        example_dict[\"label\"] = example[label_field]\n",
    "        validation_examples.append(example_dict)\n",
    "\n",
    "    # The validation split is written out as dev.tsv.\n",
    "    write_tsv(validation_examples, output_filename=os.path.join(output_dir, \"dev.tsv\"), fieldnames=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "if \"test\" in datasets:\n",
    "    # Derive the TSV header in this cell instead of reusing `columns` from the\n",
    "    # train cell: that name only exists when a \"train\" split was present and\n",
    "    # the train cell has been run, so relying on it can raise NameError here.\n",
    "    text_fields = [field for field in TASK_CONFIG[task_name] if field is not None]\n",
    "    columns = text_fields + [\"label\"]\n",
    "    # Source column of the label: the tagging tasks keep it under a task-specific\n",
    "    # name, every other task under \"label\".\n",
    "    label_field = {\"conll2003\": \"ner_tags\", \"en_ewt\": \"upos\"}.get(task_name, \"label\")\n",
    "\n",
    "    test_examples = []\n",
    "    for example in datasets[\"test\"]:\n",
    "        example_dict = {field: example[field] for field in text_fields}\n",
    "        # label\n",
    "        example_dict[\"label\"] = example[label_field]\n",
    "        test_examples.append(example_dict)\n",
    "\n",
    "    write_tsv(test_examples, output_filename=os.path.join(output_dir, \"test.tsv\"), fieldnames=columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Playground"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# anything you want to demo!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
