{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/jaehyung/anaconda3/envs/human/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "In Transformers v4.0.0, the default path to cache downloaded models changed from '~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden and '~/.cache/torch/transformers' is a directory that exists, we're moving it to '~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should only see this message once.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import torch\n",
    "import numpy as np\n",
    "import time \n",
    "import torch.nn.functional as F\n",
    "import easydict\n",
    "import pickle\n",
    "from torch.utils.data import TensorDataset, DataLoader\n",
    "from tqdm import tqdm\n",
    "from datasets import load_dataset\n",
    "import json\n",
    "\n",
    "from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM, RobertaForMaskedLM\n",
    "from transformers import RobertaTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = easydict.EasyDict({\"mode\" : 'train'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract Preference with Existing Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset dynabench_dyna_sent (/home/jaehyung/.cache/huggingface/datasets/dynabench___dynabench_dyna_sent/dynabench.dynasent.r2.all/1.1.0/ab89971d9ae1aacc59ed44d6855bf0e89167417257e2c2666f38e532148f2967)\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 189.83it/s]\n"
     ]
    }
   ],
   "source": [
    "r1_dataset = load_dataset(\"dynabench/dynasent\", \"dynabench.dynasent.r2.all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_sent = r1_dataset['train']['sentence']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_samples = len(r1_dataset['train'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_labels = r1_dataset['train']['label_distribution']\n",
    "n_sample = len(all_labels)\n",
    "all_annotation = torch.zeros(n_sample, 4)\n",
    "label_str = ['negative', 'positive', 'neutral', 'mixed']\n",
    "\n",
    "for i in range(n_sample):\n",
    "    for j in range(len(label_str)):\n",
    "        all_annotation[i, j] = len(all_labels[i][label_str[j]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "divide_anno = torch.zeros(5, n_samples).long()\n",
    "for i in range(n_sample):\n",
    "    idx = 0\n",
    "    for j in range(4):\n",
    "        for k in range(int(all_annotation[i, j])):\n",
    "            if j == 3:\n",
    "                selected_cls = torch.randint(0, 2, (1,))\n",
    "                divide_anno[idx, i] = selected_cls\n",
    "            else:\n",
    "                divide_anno[idx, i] = j\n",
    "            idx += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save('./pre_gen/dynasent2_all_anno.npy', divide_anno[torch.randperm(5)])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hard Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_samples = len(r1_dataset[args.mode]['gold_label'])\n",
    "hard_labels = []\n",
    "gold_labels = r1_dataset[args.mode]['gold_label']\n",
    "for i in range(n_samples):\n",
    "    if gold_labels[i] == 'negative':\n",
    "        gold_label = 0\n",
    "    elif gold_labels[i] == 'positive':\n",
    "        gold_label = 1\n",
    "    else:\n",
    "        gold_label = 2\n",
    "    hard_labels.append(gold_label)\n",
    "hard_labels_t = torch.LongTensor(hard_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Soft Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "soft_labels = torch.zeros(n_samples, 3)\n",
    "all_labels = r1_dataset[args.mode]['label_distribution']\n",
    "for i in range(n_samples):\n",
    "    soft_labels[i, 0] += len(all_labels[i]['negative'])\n",
    "    soft_labels[i, 1] += len(all_labels[i]['positive'])\n",
    "    soft_labels[i, 2] += len(all_labels[i]['neutral'])\n",
    "    \n",
    "    soft_labels[i, 0] += (0.5 * len(all_labels[i]['mixed']))\n",
    "    soft_labels[i, 1] += (0.5 * len(all_labels[i]['mixed']))\n",
    "soft_labels = soft_labels / soft_labels.sum(dim=-1, keepdim=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save('./pre_gen/dynasent2_soft_label.npy', soft_labels.numpy())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Human Preference (at least)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_19971/3234990758.py:3: UserWarning: This overload of nonzero is deprecated:\n",
      "\tnonzero()\n",
      "Consider using one of the following signatures instead:\n",
      "\tnonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)\n",
      "  indices_all[label_str[0]] = list((hard_labels_t == 0).float().nonzero()[:, 0].numpy())\n"
     ]
    }
   ],
   "source": [
    "label_str = ['negative', 'positive', 'neutral']\n",
    "indices_all = {}\n",
    "indices_all[label_str[0]] = list((hard_labels_t == 0).float().nonzero()[:, 0].numpy())\n",
    "indices_all[label_str[1]] = list((hard_labels_t == 1).float().nonzero()[:, 0].numpy())\n",
    "indices_all[label_str[2]] = list((hard_labels_t == 2).float().nonzero()[:, 0].numpy())\n",
    "prefers = soft_labels.max(dim=1)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "human_preference_all = {}\n",
    "for label in label_str:\n",
    "    pref = prefers[indices_all[label]]\n",
    "    mat = pref.unsqueeze(1) - pref.unsqueeze(0)\n",
    "    convert = 2 * (mat == 0) + 1 * (mat > 0) + torch.eye(len(indices_all[label]))\n",
    "    \n",
    "    human_preference_all[label] = mat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('./pre_gen/dynasent2_indices.pkl', 'wb') as f:\n",
    "    pickle.dump(indices_all, f)\n",
    "    \n",
    "with open('./pre_gen/dynasent2_human_pref.pkl', 'wb') as f:\n",
    "    pickle.dump(human_preference_all, f)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "rand_idx_pref = torch.zeros(20, n_samples, 2).long()\n",
    "\n",
    "for i in range(n_samples):\n",
    "    candidate_idx = indices_all[gold_labels[i]]\n",
    "    i_loc = candidate_idx.index(i)\n",
    "\n",
    "    for k in range(20):\n",
    "        rand_idx = np.random.randint(0, len(candidate_idx))\n",
    "        while i_loc == rand_idx:\n",
    "            rand_idx = np.random.randint(0, len(candidate_idx))\n",
    "\n",
    "        rand_idx_pref[k, i, 0] = candidate_idx[rand_idx]\n",
    "        rand_idx_pref[k, i, 1] = human_preference_all[gold_labels[i]][i_loc, rand_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save('./pre_gen/dynasent2_idx_pref_random20.npy', rand_idx_pref.numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "human",
   "language": "python",
   "name": "human"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
