{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jailbreakbench as jbb\n",
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.svm import SVC\n",
    "from scipy.special import softmax\n",
    "\n",
    "from SPD.main_spd import calculate_logits\n",
    "\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Save the logits\n",
    "For each dataset, calculate and save logit values in seperate folders. Hyperparameters such as temperature, top-p, top-k can be modified by altering the SPD/config_spd.py file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location to save the data\n",
    "write_path = \"logits/gcg/test\"\n",
    "\n",
    "# Load the data\n",
    "model_name = \"llama-2-7b-chat-hf\"\n",
    "data_path = \"data/Llama2/GCG.json\" \n",
    "data = pd.read_json(path_or_buf=data_path)[model_name]\n",
    "\n",
    "ind_to_read = [*range(10)]\n",
    "prompts = {}\n",
    "for i in range(len(ind_to_read)):\n",
    "    prompts[i] = data[ind_to_read[i]][\"prompt\"]\n",
    "\n",
    "all_prompts = {model_name: prompts}\n",
    "    \n",
    "# Saves the logit values in write_path/model_name\n",
    "calculate_logits(write_path, all_prompts, llm_provider=\"vllm\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Load training data\n",
    "\n",
    "Load the saved logit values and prepare the feature vector. While $r$ determines the number of token locations, $k$ determines how many candidates are calculated per each location."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = \"llama-2-7b-chat-hf\"\n",
    "\n",
    "train_size = 100\n",
    "r = 5   # Number of token places to consider\n",
    "k = 25  # Number of candidate logit values to consider\n",
    "\n",
    "# Locations of the train data\n",
    "read_paths_train = [\"logits/qnli/train\",\n",
    "                    \"logits/alpaca/train\"\n",
    "                    \"logits/gcg/train\",\n",
    "                    \"logits/autodan/train\"]\n",
    "\n",
    "logit_values_train = []\n",
    "\n",
    "for path in read_paths_train:\n",
    "    logits = np.load(os.path.join(path, f\"{model_name}.npy\"))\n",
    "    probabilities = softmax(logits, axis=2)\n",
    "    values = - np.log(probabilities)\n",
    "    logit_values_train.append(values[:train_size,:r,:k].reshape(train_size, r * k))\n",
    "\n",
    "logit_values_train = np.array(logit_values_train)\n",
    "benign_indexes = [0,1]\n",
    "attack_indexes = [2,3]\n",
    "\n",
    "train_benign   = np.concatenate((logit_values_train[benign_indexes]))\n",
    "train_attack = np.concatenate((logit_values_train[attack_indexes]))\n",
    "            \n",
    "train_x = np.concatenate((train_benign,train_attack))\n",
    "train_y = np.concatenate((np.zeros(train_benign.shape[0]),np.ones(train_attack.shape[0])))    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Train the classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the classifier \n",
    "clf = SVC(kernel=\"rbf\",class_weight=\"balanced\")\n",
    "clf.fit(train_x, train_y) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Load the test data\n",
    "\n",
    "Load the saved logit values and prepare the feature vector. $r$ and $k$ values should remain the same with training data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locations of the test data\n",
    "read_paths_test = [\"logits/qnli/test\",\n",
    "                    \"logits/alpaca/test\"\n",
    "                    \"logits/gcg/test\",\n",
    "                    \"logits/autodan/test\"]\n",
    "test_size = 100\n",
    "\n",
    "logit_values_test = []\n",
    "\n",
    "for path in range(len(read_paths_test)):\n",
    "    logits = np.load(os.path.join(path, f\"{model_name}.npy\"))\n",
    "    probabilities = softmax(logits, axis=2)\n",
    "    values = - np.log(probabilities)\n",
    "    logit_values_test.append(values[:test_size,:r,:k].reshape(test_size, r * k))\n",
    "\n",
    "logit_values_test = np.array(logit_values_test)\n",
    "benign_indexes = [0,1]\n",
    "attack_indexes = [2,3]\n",
    "\n",
    "test_benign   = np.concatenate((logit_values_test[benign_indexes]))\n",
    "test_attack = np.concatenate((logit_values_test[attack_indexes]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5. Test the classifier\n",
    "\n",
    "Get the TP and FP rates of the test data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_benign = clf.predict(test_benign)        \n",
    "pred_attack = clf.predict(test_attack)        \n",
    "\n",
    "print('TP rate:', sum(pred_attack)/len(pred_attack) * 100)\n",
    "print('FP rate:', sum(pred_benign)/len(pred_benign) * 100)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
