{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.stats import ttest_rel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "model: Llama-2-7b-hf, dataset: jailbreak_llms, mean cost: -10.4712939453125\n",
      "model: Mistral-7B-v0.1, dataset: jailbreak_llms, mean cost: -8.19462890625\n",
      "model: gemma-7b, dataset: jailbreak_llms, mean cost: -10.62402587890625\n",
      "model: Llama-2-7b-hf, dataset: beavertails, mean cost: -11.8398046875\n",
      "model: Mistral-7B-v0.1, dataset: beavertails, mean cost: -13.506953125\n",
      "model: gemma-7b, dataset: beavertails, mean cost: -13.57283203125\n",
      "model: Llama-2-7b-hf, dataset: harmbench, mean cost: -10.92185546875\n",
      "model: Mistral-7B-v0.1, dataset: harmbench, mean cost: -6.1049267578125\n",
      "model: gemma-7b, dataset: harmbench, mean cost: -11.84928955078125\n",
      "model: Llama-2-7b-hf, dataset: red_team_attempts, mean cost: -11.7631640625\n",
      "model: Mistral-7B-v0.1, dataset: red_team_attempts, mean cost: -13.4420703125\n",
      "model: gemma-7b, dataset: red_team_attempts, mean cost: -14.0812890625\n"
     ]
    }
   ],
   "source": [
    "models = ['Llama-2-7b-hf', 'Mistral-7B-v0.1', 'gemma-7b']\n",
    "datasets = ['jailbreak_llms', 'beavertails', 'harmbench', 'red_team_attempts']\n",
    "for dataset in datasets:\n",
    "    for model in models:\n",
    "        file_path = f'results/safety_eval/{dataset}/{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff/vanilla_table.csv'\n",
    "        df = pd.read_csv(file_path)\n",
    "        print(f\"model: {model}, dataset: {dataset}, mean cost: {df['Cost/Reward'].mean()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "models = ['Llama-2-7b-hf']\n",
    "model_name_map = {'Llama-2-7b-hf': 'Llama2'}\n",
    "pefts = ['', '_sharegpt_ia3_ff_1']\n",
    "# titiles = ['Patch Base with DPO', 'Patch SFT with DPO']\n",
    "data_sets = ['jailbreak_llms', 'beavertails', 'harmbench', 'red_team_attempts']\n",
    "neuron_indexes = [\n",
    "    'difference_on_hh_prompt_last_token',\n",
    "    'std_on_hh_prompt_last_token',\n",
    "]\n",
    "labels = [\n",
    "    'Prompt Difference',\n",
    "    'Activation Variance'\n",
    "]\n",
    "dfs = []\n",
    "folder_template = '../results/safety_eval/{}/{}{}_vs_{}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff'\n",
    "for dataset in data_sets:\n",
    "    for model in models:\n",
    "        for peft in pefts:\n",
    "            for index, label in zip(neuron_indexes, labels):\n",
    "                file_path = folder_template.format(dataset, model, peft, model) + f'/guided_by_{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff_idx_{model}_{index}.csv'\n",
    "                df = pd.read_csv(file_path)\n",
    "                new_df = pd.DataFrame({\n",
    "                    'cost': df.iloc[:, 3],\n",
    "                    'Model': model_name_map[model],\n",
    "                    'Patched Model': 'SFT' if peft else 'Base',\n",
    "                    'Method': label,\n",
    "                    'Dataset': dataset\n",
    "                })\n",
    "                dfs.append(new_df)\n",
    "\n",
    "df = pd.concat(dfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cost</th>\n",
       "      <th>Model</th>\n",
       "      <th>Patched Model</th>\n",
       "      <th>Method</th>\n",
       "      <th>Dataset</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-4.366733</td>\n",
       "      <td>Llama2</td>\n",
       "      <td>SFT</td>\n",
       "      <td>Prompt Difference</td>\n",
       "      <td>jailbreak_llms</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-3.707187</td>\n",
       "      <td>Llama2</td>\n",
       "      <td>SFT</td>\n",
       "      <td>Activation Variance</td>\n",
       "      <td>jailbreak_llms</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       cost   Model Patched Model               Method         Dataset\n",
       "0 -4.366733  Llama2           SFT    Prompt Difference  jailbreak_llms\n",
       "0 -3.707187  Llama2           SFT  Activation Variance  jailbreak_llms"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[(df['Patched Model']=='SFT') & (df['Dataset'] == data_sets[0])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Llama-2-7b-hf 0.03877784942626909\n",
      "Llama-2-7b-hf_sharegpt_ia3_ff_1 0.08075275889265345\n",
      "Mistral-7B-v0.1 0.06518549372083222\n",
      "Mistral-7B-v0.1_sharegpt_ia3_ff_1 0.4855730736508147\n",
      "gemma-7b 0.07574285702013972\n",
      "gemma-7b_sharegpt_ia3_ff_1 0.10123429399487287\n"
     ]
    }
   ],
   "source": [
    "models = ['Llama-2-7b-hf', 'Mistral-7B-v0.1', 'gemma-7b']\n",
    "model_name_map = {'Llama-2-7b-hf': 'Llama2', 'Mistral-7B-v0.1': 'Mistral', 'gemma-7b': 'Gemma'}\n",
    "pefts = ['', '_sharegpt_ia3_ff_1']\n",
    "data_sets = [\n",
    "    'hh_harmless',\n",
    "]\n",
    "neuron_indexes = [\n",
    "    'sft_vs_dpo_on_hh_harmless_sft_completion',\n",
    "]\n",
    "\n",
    "pvalue = []\n",
    "folder_template = '../results/arena/{}{}_vs_{}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff'\n",
    "for model in models:\n",
    "    for peft in pefts:\n",
    "        for index in neuron_indexes:\n",
    "            file_path = folder_template.format(model, peft, model) + f'/top20000_guided_by_{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff_idx_{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff_{index}_table.csv'\n",
    "            df = pd.read_csv(file_path)\n",
    "            safety_scores = df.iloc[:10, -1]\n",
    "            file_path = folder_template.format(model, peft, model) + f'/top20000_guided_by_{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff_idx_{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff_{index}_random_neurons_table.csv'\n",
    "            df = pd.read_csv(file_path)\n",
    "            random_scores = df.iloc[:10, -1]\n",
    "            result = ttest_rel(safety_scores, random_scores)\n",
    "            print(f'{model}{peft}', result.pvalue)\n",
    "            pvalue.append(result.pvalue)\n",
    "            \n",
    "\n",
    "\n",
    "# df = pd.concat(dfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1.6654461777227642e-18,\n",
       " 8.752397395988581e-12,\n",
       " 2.2781450657061972e-15,\n",
       " 1.289224957006062e-09,\n",
       " 7.720960875414595e-14,\n",
       " 1.1494755332623734e-06]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pvalue "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.13 ('alignment')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fa5059266a314ab8b54f0ed734c52e1c70437da747820dac7ce3245ce71fcc13"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
