{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from scipy.stats import wasserstein_distance\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import random\n",
    "import string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the bias prompt sets; each one is the stem of a CSV file read\n",
    "# from data/pew_prompts/ by the generation loop.\n",
    "# NOTE(review): 'negative_wording' carries no '-50' suffix, unlike the other\n",
    "# entries -- confirm it matches the CSV file name.\n",
    "bias_types = [\n",
    "    'acquiescence-50',\n",
    "    'response_order-50',\n",
    "    'odd_even-50',\n",
    "    'opinion_float-50',\n",
    "    'negative_wording',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "#exclude tokens that contain numbers\n",
    "\n",
    "# Per-token probability of applying a perturbation; read by generate_key_typo\n",
    "# (the matching check in generate_middle_random is commented out).\n",
    "PROB_PERMUTE = 0.2\n",
    "\n",
    "def generate_letter_swap(question):\n",
    "    \"\"\"Swap one pair of adjacent interior characters in each long token.\n",
    "\n",
    "    Only the first line of ``question`` is perturbed; the remaining lines are\n",
    "    passed through unchanged. A token is eligible when it has more than four\n",
    "    characters and contains no numeric character, so years, amounts and\n",
    "    percentages keep their value even when punctuation is attached\n",
    "    (e.g. '2020,' or '$1,000'). The first and last characters of a token\n",
    "    never move. The first line is split on whitespace and re-joined with\n",
    "    single spaces, so runs of whitespace in it are collapsed.\n",
    "\n",
    "    Args:\n",
    "        question: Prompt text; may span several lines.\n",
    "\n",
    "    Returns:\n",
    "        The perturbed prompt with its lines joined by a newline. Non-string\n",
    "        input (such as the NaN pandas yields for an empty cell) and empty\n",
    "        strings are returned unchanged.\n",
    "    \"\"\"\n",
    "    if not isinstance(question, str):\n",
    "        return question\n",
    "    lines = question.splitlines()\n",
    "    if not lines:\n",
    "        # Empty string: there is no first line to perturb.\n",
    "        return question\n",
    "\n",
    "    tokens = lines[0].split()\n",
    "    for pos, token in enumerate(tokens):\n",
    "        # Skip short tokens and any token containing a numeric character.\n",
    "        if len(token) <= 4 or any(ch.isnumeric() for ch in token):\n",
    "            continue\n",
    "        # idx and idx + 1 both lie strictly between the first and last character.\n",
    "        idx = random.randint(1, len(token) - 3)\n",
    "        chars = list(token)\n",
    "        chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]\n",
    "        tokens[pos] = ''.join(chars)\n",
    "    lines[0] = ' '.join(tokens)\n",
    "\n",
    "    return '\\n'.join(lines)\n",
    "    \n",
    "def generate_middle_random(question):\n",
    "    \"\"\"Shuffle the interior characters of each long word in the first line.\n",
    "\n",
    "    Only the first line of ``question`` is perturbed; the remaining lines are\n",
    "    passed through unchanged. A token is eligible when it has more than four\n",
    "    characters and contains neither a numeric character nor any ASCII\n",
    "    punctuation (``string.punctuation``). The first and last characters stay\n",
    "    in place; everything between them is shuffled, which can by chance\n",
    "    reproduce the original order. The first line is split on whitespace and\n",
    "    re-joined with single spaces, so runs of whitespace in it are collapsed.\n",
    "\n",
    "    Args:\n",
    "        question: Prompt text; may span several lines.\n",
    "\n",
    "    Returns:\n",
    "        The perturbed prompt with its lines joined by a newline. Non-string\n",
    "        input (such as the NaN pandas yields for an empty cell) and empty\n",
    "        strings are returned unchanged.\n",
    "    \"\"\"\n",
    "    if not isinstance(question, str):\n",
    "        return question\n",
    "    lines = question.splitlines()\n",
    "    if not lines:\n",
    "        # Empty string: there is no first line to perturb.\n",
    "        return question\n",
    "\n",
    "    punctuation = set(string.punctuation)\n",
    "    tokens = lines[0].split()\n",
    "    for pos, token in enumerate(tokens):\n",
    "        if len(token) <= 4:\n",
    "            continue\n",
    "        # Leave numbers (even when mixed with letters) and punctuated tokens intact.\n",
    "        if any(ch.isnumeric() or ch in punctuation for ch in token):\n",
    "            continue\n",
    "        middle = list(token[1:-1])\n",
    "        random.shuffle(middle)\n",
    "        tokens[pos] = token[0] + ''.join(middle) + token[-1]\n",
    "    lines[0] = ' '.join(tokens)\n",
    "\n",
    "    return '\\n'.join(lines)\n",
    "\n",
    "    \n",
    "def generate_key_typo(question, prob=None):\n",
    "    \"\"\"Overwrite one character with a random lowercase letter in some tokens.\n",
    "\n",
    "    Only the first line of ``question`` is perturbed; the remaining lines are\n",
    "    passed through unchanged. Each token of the first line is selected with\n",
    "    probability ``prob``; tokens containing a numeric character are never\n",
    "    changed, so years, amounts and percentages keep their value. In a\n",
    "    selected token one uniformly chosen position is overwritten with a\n",
    "    uniformly chosen letter from a-z, which can by chance equal the original\n",
    "    character. The first line is split on whitespace and re-joined with\n",
    "    single spaces, so runs of whitespace in it are collapsed.\n",
    "\n",
    "    Args:\n",
    "        question: Prompt text; may span several lines.\n",
    "        prob: Per-token probability of introducing a typo. When None, the\n",
    "            module-level PROB_PERMUTE is used.\n",
    "\n",
    "    Returns:\n",
    "        The perturbed prompt with its lines joined by a newline. Non-string\n",
    "        input (such as the NaN pandas yields for an empty cell) and empty\n",
    "        strings are returned unchanged.\n",
    "    \"\"\"\n",
    "    if not isinstance(question, str):\n",
    "        return question\n",
    "    lines = question.splitlines()\n",
    "    if not lines:\n",
    "        # Empty string: there is no first line to perturb.\n",
    "        return question\n",
    "    if prob is None:\n",
    "        # Resolved at call time so later changes to the constant take effect.\n",
    "        prob = PROB_PERMUTE\n",
    "\n",
    "    tokens = lines[0].split()\n",
    "    for pos, token in enumerate(tokens):\n",
    "        # The random draw comes first, so one number is consumed per token.\n",
    "        if random.random() < prob and not any(ch.isnumeric() for ch in token):\n",
    "            chars = list(token)\n",
    "            chars[random.randrange(len(chars))] = random.choice(string.ascii_lowercase)\n",
    "            tokens[pos] = ''.join(chars)\n",
    "    lines[0] = ' '.join(tokens)\n",
    "\n",
    "    return '\\n'.join(lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "acquiescence-50\n",
      "response_order-50\n",
      "odd_even-50\n",
      "opinion_float-50\n",
      "negative_wording\n"
     ]
    }
   ],
   "source": [
    "# Write a perturbed copy of every bias prompt file.\n",
    "#\n",
    "# For each entry of bias_types the CSV is loaded, the source questions are\n",
    "# perturbed, the result overwrites the 'other' alpha column, and the frame is\n",
    "# saved next to the input as '<bias>-<perturbation>.csv'.\n",
    "\n",
    "SEED = 0  # fixed seed so that re-running this cell regenerates identical files\n",
    "PROMPT_DIR = 'data/pew_prompts/'\n",
    "\n",
    "# Perturbation to apply; its key is also used as the output file suffix.\n",
    "PERTURBATION = 'middle_random'\n",
    "PERTURBATIONS = {\n",
    "    'letter_swap': generate_letter_swap,\n",
    "    'middle_random': generate_middle_random,\n",
    "    'key_typo': generate_key_typo,\n",
    "}\n",
    "perturb = PERTURBATIONS[PERTURBATION]\n",
    "\n",
    "random.seed(SEED)\n",
    "\n",
    "for bias in bias_types:\n",
    "\n",
    "    print(bias)\n",
    "\n",
    "    df = pd.read_csv(PROMPT_DIR + bias + '.csv')\n",
    "\n",
    "    if 'odd_even' in bias:\n",
    "        # odd_even files use fixed column names for source and target.\n",
    "        other_name = 'no middle alpha'\n",
    "        qs = list(df['middle alpha'])\n",
    "    else:\n",
    "        # The target is the '*alpha*' column other than 'orig alpha'; when\n",
    "        # several columns match, the last one wins.\n",
    "        other_name = None\n",
    "        for col in df.columns:\n",
    "            if 'alpha' in col and col != 'orig alpha':\n",
    "                other_name = col\n",
    "\n",
    "        assert other_name is not None, 'no alpha column besides orig alpha in ' + bias\n",
    "\n",
    "        qs = list(df['orig alpha'])\n",
    "\n",
    "    if 'odd_even' in bias or 'opinion_float' in bias:\n",
    "        # The perturbation leaves the options untouched, so the count carries over.\n",
    "        df['num options new'] = df['num options']\n",
    "\n",
    "    df[other_name] = [perturb(sentence) for sentence in qs]\n",
    "    df.to_csv(PROMPT_DIR + bias + '-' + PERTURBATION + '.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
