{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/data1/anonymous/codes/med-sipf\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Remember the directory the kernel started in. A bare relative chdir is not\n",
    "# idempotent: every re-run of this cell would climb two more levels.\n",
    "if \"_NOTEBOOK_DIR\" not in globals():\n",
    "    _NOTEBOOK_DIR = os.getcwd()\n",
    "\n",
    "# Switch to the directory two levels above the starting one and show where we landed.\n",
    "os.chdir(os.path.join(_NOTEBOOK_DIR, \"..\", \"..\"))\n",
    "print(os.getcwd())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import dotenv\n",
    "\n",
    "# Read the .env file into the process environment. override=True means values\n",
    "# from .env replace variables that are already set. The call is the cell's\n",
    "# final expression, so its boolean return value is echoed as the output.\n",
    "dotenv.load_dotenv(override=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data1/anonymous/misc/miniconda3/envs/m1/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Republish the subsets\n",
    "\n",
    "For each source repo, load the split, drop every column that is not in `KEPT_COLUMNS`, and push the result to the matching upload repo."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns that are kept in the uploaded datasets; every other column is dropped.\n",
    "KEPT_COLUMNS = [\n",
    "    \"answer_idx\",\n",
    "    \"source\",\n",
    "    \"metadata\",\n",
    "    \"prompt\",\n",
    "    \"answer_letter\",\n",
    "    \"answer_string\",\n",
    "    \"reasoning\",\n",
    "    \"distilled_answer_string\",\n",
    "    \"domain_code\",\n",
    "    \"domain_name\",\n",
    "    \"text\",\n",
    "]\n",
    "SPLIT = \"train\"\n",
    "\n",
    "# Source repo -> upload repo, one entry per subset, processed in this order.\n",
    "REPO_PAIRS = {\n",
    "    # random_23k\n",
    "    \"anonymous/m196k-random_23k-decon_eval-r1-filter_wrong-tokenized-280325\": \"anonymous/m1k-random_23k-tokenized\",\n",
    "    # random_1k\n",
    "    \"anonymous/m196k-random_1k-decon_eval-r1-filter_wrong-tokenized-280325\": \"anonymous/m1k-random_1k-tokenized\",\n",
    "    # hard_random_1k\n",
    "    \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-hard_random_1k-tokenized-280325\": \"anonymous/m1k-hard_random_1k-tokenized\",\n",
    "    # domain_1k\n",
    "    \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-domain_1k-tokenized-280325\": \"anonymous/m1k-domain_1k-tokenized\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def republish_subset(repo_id, upload_repo_id, split=SPLIT, kept_columns=KEPT_COLUMNS):\n",
    "    \"\"\"Load one dataset split, keep only `kept_columns`, and push it to `upload_repo_id`.\n",
    "\n",
    "    Args:\n",
    "        repo_id: Hub dataset repo to read from.\n",
    "        upload_repo_id: Hub dataset repo to write to.\n",
    "        split: Split name passed to `datasets.load_dataset`.\n",
    "        kept_columns: Column names to retain. Columns of the loaded dataset that\n",
    "            are not listed are removed; listed names that the dataset does not\n",
    "            have are simply ignored.\n",
    "\n",
    "    Returns:\n",
    "        The value returned by `Dataset.push_to_hub` for the upload.\n",
    "    \"\"\"\n",
    "    dataset = datasets.load_dataset(repo_id, split=split)\n",
    "    removed_columns = [column for column in dataset.column_names if column not in kept_columns]\n",
    "    return dataset.remove_columns(removed_columns).push_to_hub(upload_repo_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload every subset and keep the push result per upload repo so the cell\n",
    "# displays one entry for each upload.\n",
    "commit_infos = {\n",
    "    upload_repo_id: republish_subset(repo_id, upload_repo_id)\n",
    "    for repo_id, upload_repo_id in REPO_PAIRS.items()\n",
    "}\n",
    "commit_infos"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "m1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
