from tqdm import tqdm
import torch
from datasets import load_dataset, concatenate_datasets, load_from_disk
import numpy as np
from transformers import AutoTokenizer
import datasets
import argparse
import os
import json
from huggingface_hub import delete_repo

# Command-line interface: which dataset to assemble and how many shards it has.
parser = argparse.ArgumentParser(
    description="Merge dataset shards saved on disk and push the result "
                "to the Hugging Face Hub.")

parser.add_argument(
    "--dataset", type=str,
    help="dataset name; shards are read from ./datasets/<dataset>_mini_<i> "
         "and the merged dataset is pushed to the Hub under this name")
parser.add_argument(
    "--total", type=int,
    help="number of shards to merge (shard indices 0 .. total-1)")


# NOTE: arguments are parsed at module level, so this runs (and reads
# sys.argv) as soon as the file is executed or imported.
args = parser.parse_args()

# Base name shared by the on-disk shards and the Hub repository.
dataset_dir = args.dataset

# Number of `_mini_<i>` shards to load; None when --total is omitted.
total_part = args.total


if __name__ == "__main__":
    # Neither option is marked required in the parser, so a missing value
    # arrives here as None. Fail with a clear usage message instead of an
    # opaque TypeError from string concatenation / range(None).
    if dataset_dir is None or total_part is None:
        parser.error("both --dataset and --total must be provided")
    # With no shards the list below would be empty, which
    # concatenate_datasets rejects; report the real cause instead.
    if total_part < 1:
        parser.error("--total must be a positive integer")

    # Load every shard saved at ./datasets/<dataset>_mini_<i>, in index order.
    dataset_list = [
        load_from_disk(f"./datasets/{dataset_dir}_mini_{i}")
        for i in range(total_part)
    ]
    train_dataset = concatenate_datasets(dataset_list)

    # Alternative kept from the original script: an already-merged dataset
    # could be loaded directly with load_from_disk("./datasets/" + dataset_dir).

    # Upload the merged data as the "train_prefs" split of a non-private repo
    # named after --dataset.
    train_dataset.push_to_hub(
        dataset_dir, split="train_prefs", private=False)
