from tqdm import tqdm
import torch
from datasets import load_dataset, concatenate_datasets, load_from_disk
import numpy as np
from transformers import AutoTokenizer
import datasets
import argparse
import os
import json
from huggingface_hub import delete_repo

# Command-line interface: two string-valued options, neither marked required,
# so an omitted option is left as None on the parsed namespace.
parser = argparse.ArgumentParser()
for _flag in ("--dataset", "--output"):
    parser.add_argument(_flag, type=str)

# NOTE: parsing happens at module level, i.e. as soon as this file is executed
# or imported, not only under the __main__ guard below.
args = parser.parse_args()

# Directory names as typed on the command line; the __main__ section prefixes
# them with "./" before use.
dataset_dir, output_dir = args.dataset, args.output


if __name__ == "__main__":
    # Neither option is declared as required, so either may be None here.
    # Report that as a normal usage error instead of letting the path
    # handling below fail with an opaque TypeError.
    if dataset_dir is None or output_dir is None:
        parser.error("both --dataset and --output must be provided")

    # Upper bound on the number of rows copied into the reduced dataset.
    subset_size = 500

    # os.path.join(".", p) yields "./p" for relative input (same as the
    # previous "./" + p) but leaves an absolute path untouched instead of
    # silently turning it into a path below the working directory.
    source_path = os.path.join(".", dataset_dir)
    target_path = os.path.join(".", output_dir)

    full_dataset = load_from_disk(source_path)

    # Clamp to the rows actually available: selecting range(500) from a
    # shorter dataset raises IndexError.
    row_count = min(subset_size, len(full_dataset))
    mini_dataset = full_dataset.select(range(row_count))
    mini_dataset.save_to_disk(target_path)
