{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/opt/dlami/nvme/anonymous/codes/med-sipf\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "os.chdir(\"../..\")\n",
    "print(os.getcwd())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import dotenv\n",
    "\n",
    "dotenv.load_dotenv(override=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/dlami/nvme/anonymous/misc/miniconda3/envs/anonymous-med_sipf/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating parquet from Arrow format: 100%|██████████| 24/24 [00:00<00:00, 24.20ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.56s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/anonymous/m23k-tokenized/commit/234c6d01e3a130de2ca21f247adb30ec2c971325', commit_message='Upload dataset', commit_description='', oid='234c6d01e3a130de2ca21f247adb30ec2c971325', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/anonymous/m23k-tokenized', endpoint='https://huggingface.co', repo_type='dataset', repo_id='anonymous/m23k-tokenized'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "repo_id = \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-tokenized-120325\"\n",
    "upload_repo_id = \"anonymous/m23k-tokenized\"\n",
    "split = \"train\"\n",
    "\n",
    "\n",
    "dataset = datasets.load_dataset(repo_id, split=split)\n",
    "\n",
    "kept_columns = ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'reasoning', 'distilled_answer_string', 'domain_code', 'domain_name','text']\n",
    "removed_columns = [column for column in dataset.column_names if column not in kept_columns]\n",
    "dataset = dataset.remove_columns(removed_columns)\n",
    "\n",
    "dataset.push_to_hub(upload_repo_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 20.61ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.18it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/anonymous/m1k-tokenized/commit/77d9a528dcff2a541e89cdc42b854f8c98c5c5d8', commit_message='Upload dataset', commit_description='', oid='77d9a528dcff2a541e89cdc42b854f8c98c5c5d8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/anonymous/m1k-tokenized', endpoint='https://huggingface.co', repo_type='dataset', repo_id='anonymous/m1k-tokenized'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "repo_id = \"anonymous/m196k-dedup-decon-filter_easy-r1-filter_wrong-decon_eval-domain_source_1k-tokenized-150325\"\n",
    "upload_repo_id = \"anonymous/m1k-tokenized\"\n",
    "split = \"train\"\n",
    "\n",
    "\n",
    "dataset = datasets.load_dataset(repo_id, split=split)\n",
    "\n",
    "kept_columns = ['answer_idx', 'source', 'metadata', 'prompt', 'answer_letter', 'answer_string', 'reasoning', 'distilled_answer_string', 'domain_code', 'domain_name','text']\n",
    "removed_columns = [column for column in dataset.column_names if column not in kept_columns]\n",
    "dataset = dataset.remove_columns(removed_columns)\n",
    "\n",
    "dataset.push_to_hub(upload_repo_id)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "anonymous-med_sipf",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
