{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import scipy.stats as stats\n",
    "from scipy.stats import wasserstein_distance\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import csv\n",
    "from collections import defaultdict\n",
    "from scipy.stats import ttest_1samp\n",
    "from statistics import mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_groups(bias_type):\n",
    "    \"\"\"Return the group labels and counted response options for a bias type.\n",
    "\n",
    "    The bias is recognised by substring match, so suffixed names such as\n",
    "    'odd_even-50' or file names such as 'acquiescence.csv' are accepted.\n",
    "\n",
    "    Args:\n",
    "        bias_type: String containing one of the known bias names.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (first_group, second_group, first_options, second_options).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If bias_type contains none of the known bias names.\n",
    "    \"\"\"\n",
    "    # Checked top to bottom: (name, first group, second group, first options,\n",
    "    # second options). None means the second group reuses the first list.\n",
    "    # The table is rebuilt on every call so each caller gets fresh lists.\n",
    "    group_table = (\n",
    "        # original author note: acquiescence should be in the other direction\n",
    "        (\"acquiescence\", \"orig alpha\", \"pos alpha\", ['a'], None),\n",
    "        (\"response_order\", \"orig alpha\", \"reversed alpha\", ['a'], None),\n",
    "        # original author note: odd_even should be in the other direction\n",
    "        (\"odd_even\", \"middle alpha\", \"no middle alpha\", ['b', 'd'], None),\n",
    "        (\"opinion_float\", \"orig alpha\", \"float alpha\", ['c'], None),\n",
    "        (\"negative_wording\", \"orig alpha\", \"forbid alpha\", ['a'], ['b']),\n",
    "    )\n",
    "\n",
    "    for name, first_group, second_group, first_options, second_options in group_table:\n",
    "        if name in bias_type:\n",
    "            break\n",
    "    else:\n",
    "        raise ValueError(f\"Invalid bias type: {bias_type}\")\n",
    "\n",
    "    if second_options is None:\n",
    "        second_options = first_options\n",
    "\n",
    "    assert len(first_options) == len(second_options)\n",
    "\n",
    "    return first_group, second_group, first_options, second_options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_stat_test(model, bias_type, use_scores=False, old_scores=None):\n",
    "    \"\"\"Tally a net score per question key from one model's response CSV.\n",
    "\n",
    "    For every CSV row, the score of row[\"key\"] goes up by one when the row\n",
    "    belongs to the first group and its response is a counted option, and\n",
    "    down by one when it belongs to the second group and its response is a\n",
    "    counted option. Groups and options come from get_groups.\n",
    "\n",
    "    Args:\n",
    "        model: Model name; the CSV is read from results/<model>/csv.\n",
    "        bias_type: Bias name used to build the CSV file name.\n",
    "        use_scores: When True, only keys contained in old_scores are scored.\n",
    "        old_scores: Container of keys to keep; needed when use_scores is True.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (scores, all_keys). scores maps each retained key to its net\n",
    "        count. all_keys is the set of every key read from the file, including\n",
    "        keys skipped by the old_scores filter.\n",
    "    \"\"\"\n",
    "    root = 'results/'+model+'/csv'\n",
    "\n",
    "    # These bias types, these models, and any ext_gen model read\n",
    "    # <bias_type>.csv; everything else reads <bias_type>-sample.csv.\n",
    "    plain_file_biases = ('key_typo', 'middle_random', 'letter_swap')\n",
    "    plain_file_models = ('llama2-7b', 'llama2-13b', 'llama2-70b', 'gpt-3.5-turbo-instruct')\n",
    "    if (any(name in bias_type for name in plain_file_biases)\n",
    "            or model in plain_file_models\n",
    "            or 'ext_gen' in model):\n",
    "        file = bias_type+'.csv'\n",
    "    else:\n",
    "        file = bias_type+'-sample.csv'\n",
    "\n",
    "    scores = {}\n",
    "    # Fix: all_keys was returned without ever being assigned (NameError on a\n",
    "    # fresh kernel). It now records every key seen in the file.\n",
    "    all_keys = set()\n",
    "\n",
    "    with open(os.path.join(root, file), newline='') as csvfile:\n",
    "        reader = csv.DictReader(csvfile)\n",
    "        first_group, second_group, first_options, second_options = get_groups(file)\n",
    "        for row in reader:\n",
    "            key = row[\"key\"]\n",
    "            all_keys.add(key)\n",
    "\n",
    "            if use_scores and key not in old_scores:\n",
    "                continue\n",
    "\n",
    "            # Register the key even if none of its rows end up counted.\n",
    "            scores.setdefault(key, 0)\n",
    "\n",
    "            if row[\"group\"] == first_group and row[\"response\"] in first_options:\n",
    "                scores[key] += 1\n",
    "            if row[\"group\"] == second_group and row[\"response\"] in second_options:\n",
    "                scores[key] -= 1\n",
    "\n",
    "    return scores, all_keys\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "models = ['gpt-3.5-turbo', 'gpt-3.5-turbo-instruct', 'llama2-7b', 'llama2-13b']\n",
    "bias_types = ['acquiescence', 'response_order', 'odd_even', 'opinion_float', 'negative_wording']\n",
    "subset_bias_types = ['acquiescence-50', 'response_order-50', 'odd_even-50', 'opinion_float-50', 'negative_wording']\n",
    "\n",
    "# One row per (bias type, model):\n",
    "# [bias type, model, old effect, old p value, new effect, new p value]\n",
    "all_results = []\n",
    "\n",
    "for bias_type in bias_types:\n",
    "    for model in models:\n",
    "\n",
    "        # The two llama2 ext_gen runs read the '-50' files for every bias\n",
    "        # except negative_wording.\n",
    "        needs_suffix = model in ('llama2-7b', 'llama2-13b') and bias_type != \"negative_wording\"\n",
    "        temp_bias_type = bias_type + \"-50\" if needs_suffix else bias_type\n",
    "\n",
    "        # get all of the keys that are used in the ext_gen run, then score\n",
    "        # the original run on that same set of keys\n",
    "        new_scores, all_keys = run_stat_test(model + \"-ext_gen\", temp_bias_type)\n",
    "        old_scores, all_keys1 = run_stat_test(model, bias_type, True, new_scores.keys())\n",
    "\n",
    "        result_row = [bias_type, model]\n",
    "        for score_map in (old_scores, new_scores):\n",
    "            values = list(score_map.values())\n",
    "            # NOTE(review): 50 is presumably the number of responses per group,\n",
    "            # making this a percentage - confirm.\n",
    "            result_row.append(mean(values)/50*100)\n",
    "            # One-sample t-test of the scores against zero; keep the p-value.\n",
    "            result_row.append(ttest_1samp(values, 0)[1])\n",
    "\n",
    "        all_results.append(result_row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the per-(bias type, model) rows from all_results into one table.\n",
    "comb_df = pd.DataFrame(all_results, columns = ['bias type', 'model', 'old effect', \"old p value\", 'new effect', 'new p value'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the combined results table to ext_gen_results.csv.\n",
    "comb_df.to_csv(\"ext_gen_results.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the saved CSV and display it.\n",
    "pd.read_csv(\"ext_gen_results.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the table rounded to 4 decimal places (comb_df itself is not modified).\n",
    "comb_df.round(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the table as LaTeX, without the index and with floats formatted to 4 decimals.\n",
    "print(comb_df.to_latex(index=False, float_format=\"{:.4f}\".format))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a 'diff' column to comb_df: old effect minus new effect, per row.\n",
    "comb_df[\"diff\"] = comb_df[\"old effect\"] - comb_df[\"new effect\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean of the 'diff' column over all rows.\n",
    "comb_df[\"diff\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter the original effect against the ext-gen effect for every row.\n",
    "# An explicit figure is created so this plot can never be drawn onto a\n",
    "# figure left over from another cell (e.g. when run outside the inline backend).\n",
    "fig, ax = plt.subplots()\n",
    "ax.scatter(comb_df['old effect'], comb_df['new effect'])\n",
    "ax.set_xlabel(\"Original Delta\")\n",
    "ax.set_ylabel(\"Ext gen Delta\")\n",
    "fig.savefig(\"effect_correlation_extgen.pdf\", format=\"pdf\", bbox_inches=\"tight\")\n",
    "\n",
    "# Pearson correlation of the two effect columns; uses the scipy.stats module\n",
    "# imported in the first cell, and indexes the result so `_` is not overwritten.\n",
    "corr = stats.pearsonr(comb_df['old effect'], comb_df['new effect'])[0]\n",
    "print(corr)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter the original p-value against the ext-gen p-value for every row.\n",
    "# An explicit figure is created so this plot can never be drawn onto a\n",
    "# figure left over from another cell (e.g. when run outside the inline backend).\n",
    "fig, ax = plt.subplots()\n",
    "ax.scatter(comb_df['old p value'], comb_df['new p value'])\n",
    "ax.set_xlabel(\"Original p value\")\n",
    "ax.set_ylabel(\"Ext gen p value\")\n",
    "fig.savefig(\"pval_correlation_extgen.pdf\", format=\"pdf\", bbox_inches=\"tight\")\n",
    "\n",
    "# Pearson correlation of the two p-value columns; uses the scipy.stats module\n",
    "# imported in the first cell, and indexes the result so `_` is not overwritten.\n",
    "corr = stats.pearsonr(comb_df['old p value'], comb_df['new p value'])[0]\n",
    "print(corr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
