{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import scipy.stats as stats\n",
    "from scipy.stats import wasserstein_distance\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import csv\n",
    "from collections import defaultdict\n",
    "from scipy.stats import ttest_1samp\n",
    "from statistics import mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_groups(bias_type):\n",
    "\n",
    "    if \"acquiescence\" in bias_type: #note that this should be in the other direction\n",
    "        first_group = \"orig alpha\"\n",
    "        second_group = \"pos alpha\"\n",
    "        first_options=['a']\n",
    "        second_options = first_options\n",
    "    elif \"response_order\" in bias_type:\n",
    "        first_group = \"orig alpha\"\n",
    "        second_group = \"reversed alpha\"\n",
    "        first_options=['a']\n",
    "        second_options = first_options\n",
    "    elif \"odd_even\" in bias_type: #note that this should be in the other direction\n",
    "        second_group = \"no middle alpha\" \n",
    "        first_group = \"middle alpha\"\n",
    "        first_options =['b','d']\n",
    "        second_options = first_options\n",
    "    elif \"opinion_float\" in bias_type:\n",
    "        first_group = \"orig alpha\"\n",
    "        second_group = \"float alpha\"\n",
    "        first_options=['c']\n",
    "        second_options = first_options\n",
    "    elif \"negative_wording\" in bias_type: \n",
    "        first_group = \"orig alpha\"\n",
    "        second_group = \"forbid alpha\"\n",
    "        first_options=['a']\n",
    "        second_options=['b']\n",
    "    else:\n",
    "        raise ValueError(f\"Invalid bias type: {bias_type}\")\n",
    "        \n",
    "    assert len(first_options) == len(second_options)\n",
    "        \n",
    "    return first_group, second_group, first_options, second_options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_stat_test(model, bias_type):\n",
    "    \n",
    "    root = 'results/'+model+'/csv'\n",
    "    \n",
    "    if 'key_typo' in bias_type or 'middle_random' in bias_type or 'letter_swap' in bias_type:\n",
    "        file = bias_type+'.csv' \n",
    "    elif model == 'llama2-70b-ift' or model == 'gpt-3.5-turbo':\n",
    "        file = bias_type+'-sample.csv'\n",
    "    else:\n",
    "        file = bias_type+'.csv'\n",
    "    \n",
    "    scores = defaultdict(lambda: 0)  \n",
    "    original = defaultdict(lambda:0)\n",
    "    \n",
    "    with open(os.path.join(root, file), newline='') as csvfile:\n",
    "        reader = csv.DictReader(csvfile)\n",
    "        first_group, second_group, first_options, second_options = get_groups(file)\n",
    "        for row in reader:\n",
    "\n",
    "            if row[\"group\"] == first_group and row[\"response\"] in first_options:\n",
    "                scores[row[\"key\"]] += 1\n",
    "                original[row[\"key\"]] += 1\n",
    "            if row[\"group\"] == second_group and row[\"response\"] in second_options:\n",
    "                scores[row[\"key\"]] += -1\n",
    "            \n",
    "    values = list(scores.values())\n",
    "    values = [value/50*100 for value in values]\n",
    "    \n",
    "    og_values = list(original.values())\n",
    "    og_values = [value/50*100 for value in og_values]\n",
    "    \n",
    "    p_value = ttest_1samp(values, 0)[1]\n",
    "    \n",
    "    return mean(og_values), mean(values), p_value\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_stat_test_steer(original_model, model, original_bias_type, bias_type):\n",
    "    \n",
    "    root = 'results/'+model+'/csv'\n",
    "    \n",
    "    if 'key_typo' in bias_type or 'middle_random' in bias_type or 'letter_swap' in bias_type:\n",
    "        file = bias_type+'.csv' \n",
    "    elif model == 'llama2-70b-ift' or model == 'gpt-3.5-turbo':\n",
    "        file = bias_type+'-sample.csv'\n",
    "    else:\n",
    "        file = bias_type+'.csv'\n",
    "    \n",
    "    scores = defaultdict(lambda: 0)  \n",
    "    sample_count = defaultdict(lambda: 0)\n",
    "    keys = []\n",
    "    \n",
    "    with open(os.path.join(root, file), newline='') as csvfile:\n",
    "        reader = csv.DictReader(csvfile)\n",
    "        first_group, second_group, first_options, second_options = get_groups(file)\n",
    "        for row in reader:\n",
    "            if row[\"group\"] == second_group and row[\"response\"] in second_options:\n",
    "                scores[row[\"key\"]] += -1\n",
    "            sample_count[row[\"key\"]] += 1\n",
    "            \n",
    "            keys.append(row[\"key\"])\n",
    "    \n",
    "    bias_type = original_bias_type\n",
    "    \n",
    "    root = 'results/'+original_model+'/csv'\n",
    "    if 'key_typo' in bias_type or 'middle_random' in bias_type or 'letter_swap' in bias_type:\n",
    "        file = bias_type+'.csv' \n",
    "    elif original_model == 'llama2-70b-ift' or original_model == 'gpt-3.5-turbo':\n",
    "        file = bias_type+'-sample.csv'\n",
    "    else:\n",
    "        file = bias_type+'.csv'\n",
    "            \n",
    "    keys = set(keys)\n",
    "    with open(os.path.join(root, file), newline='') as csvfile:\n",
    "        reader = csv.DictReader(csvfile)\n",
    "        first_group, second_group, first_options, second_options = get_groups(file)\n",
    "        for row in reader:\n",
    "            if row[\"group\"] == first_group and row[\"response\"] in first_options and\\\n",
    "            row[\"key\"] in keys and sample_count[row[\"key\"]]>0:\n",
    "                scores[row[\"key\"]] += 1\n",
    "                sample_count[row[\"key\"]] -= 1\n",
    "                \n",
    "    \n",
    "    values = list(scores.values())\n",
    "    values = [value/50*100 for value in values]\n",
    "    \n",
    "    p_value = ttest_1samp(values, 0)[1]\n",
    "    \n",
    "    return mean(values), p_value\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "models = ['gpt-3.5-turbo', 'gpt-3.5-turbo-instruct']\n",
    "bias_types = ['response_order','negative_wording']\n",
    "\n",
    "all_results = []\n",
    "for model in models:\n",
    "    for bias_type in bias_types:\n",
    "        \n",
    "        original, diff, p_value = run_stat_test(model, bias_type)\n",
    "        lst = [model, bias_type, round(diff,4), round(p_value,4)]\n",
    "\n",
    "        new_model = model+'-steer'\n",
    "        diff, p_value = run_stat_test_steer(model, new_model, bias_type, bias_type)\n",
    "        lst.append(round(diff,4))\n",
    "        lst.append(round(p_value,4))\n",
    "        all_results.append(lst)\n",
    "        \n",
    "        new_model = model+'-steer'\n",
    "        diff, p_value = run_stat_test_steer(model, new_model, bias_type, bias_type+\"-steer2\")\n",
    "        lst.append(round(diff,4))\n",
    "        lst.append(round(p_value,4))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "comb_df = pd.DataFrame(all_results, columns = ['model','bias type', 'old effect', \"pval\", 'steer effect', 'steer pval', 'steer 2 effect', 'steeer 2 pval'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "comb_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = 'gpt-3.5-turbo'\n",
    "bias_type = 'response_order'\n",
    "\n",
    "original, diff, p_value = run_stat_test(model, bias_type)\n",
    "print(\"Original effect size\", round(diff,4), \"p_value\", round(p_value,4))\n",
    "\n",
    "new_model = 'gpt-3.5-turbo-steer'\n",
    "new_bias_type = 'response_order'\n",
    "diff, p_value = run_stat_test_steer(model, new_model, bias_type, new_bias_type)\n",
    "print(\"Steered effect size\", round(diff,4), \"p_value\", round(p_value,4))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = 'gpt-3.5-turbo-instruct'\n",
    "bias_type = 'response_order'\n",
    "\n",
    "original, diff, p_value = run_stat_test(model, bias_type)\n",
    "print(\"Original effect size\", round(diff,4), \"p_value\", round(p_value,4))\n",
    "\n",
    "new_model = 'gpt-3.5-turbo-instruct-steer'\n",
    "new_bias_type = 'response_order'\n",
    "diff, p_value = run_stat_test_steer(model, new_model, bias_type, new_bias_type)\n",
    "print(\"Steered effect size\", round(diff,4), \"p_value\", round(p_value,4))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = 'gpt-3.5-turbo'\n",
    "bias_type = 'negative_wording'\n",
    "\n",
    "original, diff, p_value = run_stat_test(model, bias_type)\n",
    "print(\"Original effect size\", round(diff,4), \"p_value\", round(p_value,4))\n",
    "\n",
    "new_model = 'gpt-3.5-turbo-steer'\n",
    "new_bias_type = 'negative_wording'\n",
    "diff, p_value = run_stat_test_steer(model, new_model, bias_type, new_bias_type)\n",
    "print(\"Steered effect size\", round(diff,4), \"p_value\", round(p_value,4))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = 'gpt-3.5-turbo-instruct'\n",
    "bias_type = 'negative_wording'\n",
    "\n",
    "original, diff, p_value = run_stat_test(model, bias_type)\n",
    "print(\"Original effect size\", round(diff,4), \"p_value\", round(p_value,4))\n",
    "\n",
    "new_model = 'gpt-3.5-turbo-steer'\n",
    "new_bias_type = 'negative_wording'\n",
    "diff, p_value = run_stat_test_steer(model, new_model, bias_type, new_bias_type)\n",
    "print(\"Steered effect size\", round(diff,4), \"p_value\", round(p_value,4))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
