{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading readme: 100%|██████████| 5.82k/5.82k [00:00<00:00, 3.35MB/s]\n",
      "Using custom data configuration akoksal--LongForm-e912efe86f9a3c91\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset None/None to /data/tianle/.cache/huggingface/datasets/akoksal___parquet/akoksal--LongForm-e912efe86f9a3c91/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading data: 100%|██████████| 38.3M/38.3M [00:00<00:00, 74.6MB/s]\n",
      "Downloading data: 100%|██████████| 3.61M/3.61M [00:00<00:00, 27.6MB/s]\n",
      "Downloading data: 100%|██████████| 3.66M/3.66M [00:00<00:00, 38.2MB/s]\n",
      "Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.33it/s]\n",
      "Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2066.84it/s]\n",
      "                                                                                       \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset parquet downloaded and prepared to /data/tianle/.cache/huggingface/datasets/akoksal___parquet/akoksal--LongForm-e912efe86f9a3c91/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3/3 [00:00<00:00, 435.29it/s]\n"
     ]
    }
   ],
   "source": [
    "dataset = load_dataset(\"akoksal/LongForm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['input', 'output', 'source', 'subset'],\n",
       "        num_rows: 23652\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['input', 'output', 'source', 'subset'],\n",
       "        num_rows: 2045\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['input', 'output', 'source', 'subset'],\n",
       "        num_rows: 2042\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "uniform_len = [10, 20, 30, 40, 50, 60, 70, 80]\n",
    "non_uniform_len = [16, 22, 28, 34, 56, 62, 68, 74]\n",
    "ranking_dataset = []\n",
    "num_ans = 8\n",
    "for data in dataset[\"train\"]:\n",
    "    answer = data[\"output\"].split(\" \")\n",
    "    if len(answer) < 100:\n",
    "        continue\n",
    "    ranking_data = {}\n",
    "    uniform = (random.random() > 0.5)\n",
    "    ranking_data[\"uniform\"] = uniform\n",
    "    ranking_data[\"question\"] = data[\"input\"] + f\" Write the answer in an open-ended way.\" if uniform else data[\"input\"] + f\" Write either a short answer or a long answer.\"\n",
    "    ranking_data[\"answers\"] = []\n",
    "    if uniform:\n",
    "        for i in range(num_ans):\n",
    "            ranking_data[\"answers\"].append(\" \".join(answer[:uniform_len[i]]))\n",
    "    else:\n",
    "        for i in range(num_ans):\n",
    "            ranking_data[\"answers\"].append(\" \".join(answer[:non_uniform_len[i]]))\n",
    "    ranking_data[\"pairs\"] = []\n",
    "    for i in range(num_ans):\n",
    "        for j in range(i + 1, num_ans):\n",
    "            ranking_data[\"pairs\"].append([ranking_data[\"answers\"][j], ranking_data[\"answers\"][i]])\n",
    "    ranking_dataset.append(ranking_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15025"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ranking_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "torch.save(ranking_dataset, \"ranking_dataset_longform.pt\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "openassistant",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
