{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "M1akI4IHTWy5"
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import datasets\n",
    "import numpy as np\n",
    "import shap\n",
    "import pandas as pd\n",
    "import os\n",
    "import warnings\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm\n",
    "\n",
    "%matplotlib inline\n",
    "# Silence library warnings so they do not flood the cell outputs.\n",
    "warnings.filterwarnings('ignore')\n",
    "# Seed NumPy's global RNG, which np.random.* calls in this notebook draw from.\n",
    "np.random.seed(0)\n",
    "\n",
    "# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.\n",
    "# Iterating (rather than indexing [0]) keeps the notebook runnable on CPU-only\n",
    "# machines, where the device list is empty, and applies the same setting to\n",
    "# every visible GPU.\n",
    "physical_devices = tf.config.list_physical_devices('GPU')\n",
    "for gpu in physical_devices:\n",
    "    tf.config.experimental.set_memory_growth(gpu, enable=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "############## CONFIG ##############\n",
    "# Number of test samples handed to the attack. Only attacks that succeed are\n",
    "# kept, so the number of adversarial samples actually produced may be lower.\n",
    "NUM_ADV = 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "E5X9_82GTlDy"
   },
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Load the IMDB dataset through the `datasets` library.\n",
    "dataset = load_dataset('imdb')\n",
    "\n",
    "# Pull the review texts and their labels out of each split as NumPy arrays.\n",
    "x_train, y_train = (np.array(dataset['train'][column]) for column in ('text', 'label'))\n",
    "x_test, y_test = (np.array(dataset['test'][column]) for column in ('text', 'label'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ay-x-s_FYOBK"
   },
   "source": [
    "# Tokenizing and padding of the data\n",
    "Before we can use the data to train our model, we need to convert the samples from a string to an integer representation. To do this, we use the Keras **Tokenizer**, which automatically determines word-to-index correspondences.\n",
    "Next, the inputs need to be **padded** to have equal length. We use a max length of 100, but other choices like 128 are also quite common."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5bFsPfcYVJkG"
   },
   "outputs": [],
   "source": [
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.utils import to_categorical\n",
    "\n",
    "max_len = 100\n",
    "\n",
    "# The vocabulary is built from the training texts only.\n",
    "tokenizer = Tokenizer()\n",
    "tokenizer.fit_on_texts(x_train)\n",
    "\n",
    "# Texts -> integer sequences -> arrays padded/truncated to max_len.\n",
    "x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=max_len)\n",
    "x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=max_len)\n",
    "\n",
    "# One-hot encode the labels.\n",
    "y_train, y_test = to_categorical(y_train), to_categorical(y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ws5cMMqFZBos"
   },
   "source": [
    "# GloVe Embeddings\n",
    "In the first layer of our network we convert the representation of the data into a different space using an embedding layer. While there are many options, we decided to use the GloVe embeddings from Stanford University since they give consistently good results. The embeddings store semantic word similarities and were calculated on a massive Wikipedia dump.\n",
    "Code has been borrowed from the [Keras documentation](https://keras.io/examples/nlp/pretrained_word_embeddings/).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "GYfKvpvEe1KW"
   },
   "outputs": [],
   "source": [
    "path_to_glove_file = 'data/glove.6B.200d.txt'\n",
    "\n",
    "# Parse the GloVe text file into a {word: vector} lookup. Each line holds a word\n",
    "# followed by its space-separated coefficients.\n",
    "embeddings_index = {}\n",
    "# The encoding is passed explicitly: without it open() falls back to the platform's\n",
    "# locale encoding, which is not UTF-8 everywhere and can fail or mis-decode words.\n",
    "with open(path_to_glove_file, encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        word, coefs = line.split(maxsplit=1)\n",
    "        embeddings_index[word] = np.fromstring(coefs, 'f', sep=' ')\n",
    "\n",
    "print('Found %s word vectors.' % len(embeddings_index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "YHIRtDmzbMhJ"
   },
   "outputs": [],
   "source": [
    "num_tokens = len(tokenizer.index_word) + 2\n",
    "embedding_dim = 200\n",
    "\n",
    "# One row per token index. A row stays all-zero for every index that never\n",
    "# receives a GloVe vector, i.e. words missing from the embedding index and\n",
    "# indices that do not appear in tokenizer.index_word.\n",
    "embedding_matrix = np.zeros((num_tokens, embedding_dim))\n",
    "hits, misses = 0, 0\n",
    "for i, word in tokenizer.index_word.items():\n",
    "    embedding_vector = embeddings_index.get(word)\n",
    "    if embedding_vector is None:\n",
    "        misses += 1\n",
    "        continue\n",
    "    embedding_matrix[i] = embedding_vector\n",
    "    hits += 1\n",
    "\n",
    "print(\"Converted %d words (%d misses)\" % (hits, misses))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pEehNY8Zanuf"
   },
   "source": [
    "# Model definition\n",
    "Next, we need to define our Keras model. For the architecture we decided to closely follow the [Bi-LSTM](https://github.com/QData/TextAttack/blob/a029964dc736f7b073f0cff83ad8c47e820f8d18/textattack/models/helpers/lstm_for_classification.py) from the TextAttack library."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "AzAiUJAqfIr6"
   },
   "outputs": [],
   "source": [
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional\n",
    "\n",
    "# Embedding layer initialised with the pretrained matrix and left trainable.\n",
    "# NOTE: the original author marked this as failing with numpy >= 1.2\n",
    "# (presumably 1.20 is meant -- TODO confirm).\n",
    "embedding_layer = Embedding(\n",
    "    num_tokens,\n",
    "    embedding_dim,\n",
    "    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),\n",
    "    input_length=max_len,\n",
    "    trainable=True,\n",
    ")\n",
    "\n",
    "# Embedding -> dropout -> bidirectional LSTM -> dropout -> 2-unit sigmoid output.\n",
    "model = Sequential([\n",
    "    embedding_layer,\n",
    "    Dropout(0.3),\n",
    "    Bidirectional(LSTM(150 // 2, dropout=0.3)),\n",
    "    Dropout(0.3),\n",
    "    Dense(2, activation='sigmoid'),\n",
    "])\n",
    "\n",
    "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'], run_eagerly=True)\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "w8QOsygEfXck"
   },
   "outputs": [],
   "source": [
    "# Train the model\n",
    "from tensorflow.keras.callbacks import ModelCheckpoint\n",
    "\n",
    "# Write a checkpoint only when the validation loss reaches a new minimum.\n",
    "mcp_save = ModelCheckpoint(\n",
    "    'models/model_IMDB.h5',\n",
    "    monitor='val_loss',\n",
    "    mode='min',\n",
    "    save_best_only=True,\n",
    ")\n",
    "\n",
    "# Despite its name, `probas` holds the return value of fit(), not predictions.\n",
    "probas = model.fit(\n",
    "    x_train,\n",
    "    y_train,\n",
    "    batch_size=128,\n",
    "    epochs=5,\n",
    "    validation_split=0.1,\n",
    "    callbacks=[mcp_save],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "S450CxWdWHOJ"
   },
   "outputs": [],
   "source": [
    "# Evaluate the trained model on the clean, padded test set.\n",
    "model.evaluate(x=x_test, y=y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "n0kfXC7NbgTv"
   },
   "source": [
    "# Generate the adversarial examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from textattack.models.wrappers import ModelWrapper\n",
    "\n",
    "class KerasModelWrapper(ModelWrapper):\n",
    "    \"\"\"Adapter that lets TextAttack query the Keras classifier with raw strings.\n",
    "\n",
    "    Args:\n",
    "        model: Keras model that is called on the padded integer sequences.\n",
    "        tokenizer: Fitted Keras Tokenizer used to turn texts into integer sequences.\n",
    "        max_len: Length the sequences are padded/truncated to. Defaults to 100,\n",
    "            the value that used to be hard-coded in __call__.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, model, tokenizer, max_len=100):\n",
    "        self.model = model\n",
    "        # Keep the tokenizer on the instance instead of relying on a notebook global.\n",
    "        self.tokenizer = tokenizer\n",
    "        self.max_len = max_len\n",
    "\n",
    "    def __call__(self, text_input_list):\n",
    "        \"\"\"Return the model outputs for a list of strings as a NumPy array.\"\"\"\n",
    "        sequences = self.tokenizer.texts_to_sequences(text_input_list)\n",
    "        padded = pad_sequences(sequences, maxlen=self.max_len)\n",
    "        preds = self.model(padded)\n",
    "        return preds.numpy()\n",
    "\n",
    "    def get_grad(self, text_input):\n",
    "        \"\"\"Gradients are not supported by this wrapper.\"\"\"\n",
    "        raise NotImplementedError()\n",
    "\n",
    "\n",
    "# Attack the checkpoint written during training. Previously the loaded model was\n",
    "# discarded right away because the name was rebound to a wrapper around `model`.\n",
    "checkpoint_model = tf.keras.models.load_model('models/model_IMDB.h5')\n",
    "model_textattack = KerasModelWrapper(checkpoint_model, tokenizer, max_len=max_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from textattack.datasets import HuggingFaceDataset\n",
    "from textattack.attack_recipes import PWWSRen2019\n",
    "from textattack.attack_results.successful_attack_result import SuccessfulAttackResult\n",
    "from textattack.goal_function_results.goal_function_result import GoalFunctionResultStatus\n",
    "\n",
    "# Shuffled IMDB test split, attacked with the PWWS recipe.\n",
    "dataset = HuggingFaceDataset(\"imdb\", None, \"test\", shuffle=True)\n",
    "attack = PWWSRen2019.build(model_textattack)\n",
    "\n",
    "adv_x_test = []\n",
    "adv_y_test = []\n",
    "\n",
    "results_iterable = attack.attack_dataset(dataset, indices=range(NUM_ADV))\n",
    "for result in tqdm(results_iterable, total=NUM_ADV):\n",
    "    # Keep only attacks whose goal status is SUCCEEDED; everything else is dropped.\n",
    "    if result.perturbed_result.goal_status != GoalFunctionResultStatus.SUCCEEDED:\n",
    "        continue\n",
    "    adv_x_test.append(result.perturbed_text())\n",
    "    # Label each adversarial text with the ground truth of the sample it came from.\n",
    "    adv_y_test.append(result.original_result.ground_truth_output)\n",
    "\n",
    "adv_x_test = np.array(adv_x_test)\n",
    "adv_y_test = np.array(adv_y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the adversarial texts with the same tokenizer and padding length as the clean data.\n",
    "adv_x_test = pad_sequences(tokenizer.texts_to_sequences(adv_x_test), maxlen=max_len)\n",
    "\n",
    "# One-hot encode the labels with a fixed width of two classes.\n",
    "adv_y_test = to_categorical(adv_y_test, num_classes=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "vrEv7Szb7a8-"
   },
   "outputs": [],
   "source": [
    "# Simple sanity check on the adversarial data. This should give close to 0% accuracy.\n",
    "# Evaluate the model held by the TextAttack wrapper, i.e. the one the attack was run against.\n",
    "model_textattack.model.evaluate(adv_x_test, adv_y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OrMfcXWjb1pa"
   },
   "source": [
    "# SHAP value generation and visualization\n",
    "With our pretrained model we can generate the SHAP values for a single sample using SHAP's [DeepExplainer](https://shap.readthedocs.io/en/latest/generated/shap.DeepExplainer.html) implementation. Before doing this we need to calculate the expected value of the dataset using 100-1000 examples of the training set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "fuZmvIV8Xtxm"
   },
   "outputs": [],
   "source": [
    "# https://github.com/slundberg/shap/issues/1556\n",
    "# SHAP still has problems with TF >= 2.0\n",
    "# Workaround: turn off TensorFlow's v2 behaviour and eager execution before the\n",
    "# model that SHAP will explain is loaded.\n",
    "import tensorflow as tf\n",
    "tf.compat.v1.disable_v2_behavior()\n",
    "tf.compat.v1.disable_eager_execution()\n",
    "\n",
    "# Reload the saved checkpoint after the switch. This rebinds `model`, so cells that\n",
    "# run after this one see the reloaded checkpoint rather than the model trained above.\n",
    "from tensorflow import keras\n",
    "model = keras.models.load_model('models/model_IMDB.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tSb6ebSs2DVV"
   },
   "outputs": [],
   "source": [
    "# Draw 500 distinct training samples (no replacement) as the background set for SHAP.\n",
    "background_idx = np.random.choice(x_train.shape[0], 500, replace=False)\n",
    "background = x_train[background_idx]\n",
    "explainer = shap.DeepExplainer(model, background)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "89vSU3c0q-bj"
   },
   "outputs": [],
   "source": [
    "from time import time\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Use as many samples as both sets can provide.\n",
    "max_samples = min(adv_x_test.shape[0], x_test.shape[0])\n",
    "shap_values_list_org, shap_values_list_adv = [], []\n",
    "\n",
    "# NOTE: x_test[i] is not the text adv_x_test[i] was derived from -- the attack ran on\n",
    "# a shuffled copy of the test split and unsuccessful attacks were dropped -- so the\n",
    "# two lists are unpaired collections of original and adversarial samples.\n",
    "for i in tqdm(range(max_samples)):\n",
    "    sample = slice(i, i + 1)\n",
    "\n",
    "    # Explain one sample at a time and swap the first two axes of the result\n",
    "    # (presumably class-first -> sample-first; TODO confirm against SHAP's output).\n",
    "    shap_values_org = np.swapaxes(explainer.shap_values(x_test[sample]), 0, 1)\n",
    "    shap_values_adv = np.swapaxes(explainer.shap_values(adv_x_test[sample]), 0, 1)\n",
    "\n",
    "    shap_values_list_org.extend(shap_values_org)\n",
    "    shap_values_list_adv.extend(shap_values_adv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "rk0uHdi3WHOK"
   },
   "outputs": [],
   "source": [
    "# Flatten each sample's SHAP values into one row of length num_classes * max_len.\n",
    "# The row length is derived instead of hard-coding 200, so it follows max_len and\n",
    "# cannot be mistaken for embedding_dim (which also happens to be 200).\n",
    "num_classes = y_test.shape[1]  # width of the one-hot labels\n",
    "shap_values_list_org = np.reshape(shap_values_list_org, (max_samples, num_classes * max_len))\n",
    "shap_values_list_adv = np.reshape(shap_values_list_adv, (max_samples, num_classes * max_len))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "iG0xz18bGQ4a"
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.utils import shuffle\n",
    "\n",
    "# Stack the SHAP rows of the original samples on top of the adversarial ones ...\n",
    "data = np.concatenate((shap_values_list_org, shap_values_list_adv))\n",
    "\n",
    "# ... and label them 0 (original) / 1 (adversarial) in the same order.\n",
    "org_labels = np.zeros(max_samples, dtype=np.int16)\n",
    "adv_labels = np.ones(max_samples, dtype=np.int16)\n",
    "labels = np.concatenate((org_labels, adv_labels))\n",
    "\n",
    "# Shuffle rows and labels together with a fixed seed, then save both arrays.\n",
    "X, Y = shuffle(data, labels, random_state=0)\n",
    "\n",
    "for path, array in (('data/IMDB_shapvals.npy', X), ('data/IMDB_labels.npy', Y)):\n",
    "    np.save(path, array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "Keras_IMDB.ipynb",
   "private_outputs": true,
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
