import logging

from MatLM import utils
from MatLM.data import Binarizer, DatasetBinarizer
from MatLM.option import PreprocessArg, parse_preprocess_args


# Configure the root logger once at import time so that the INFO-level
# progress message emitted by main() is shown with a timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


def main(args: PreprocessArg):
    """Run the dataset preprocessing pipeline described by ``args``.

    A tokenizer is obtained via ``utils.get_tokenizer``, wrapped in a
    ``Binarizer``, and handed to
    ``DatasetBinarizer.binarize_dataset_with_multiprocess`` together with
    the remaining options taken from ``args``.

    Args:
        args: Parsed preprocessing options. The attributes read here are
            ``tokenizer``, ``auth``, ``append_eos``, ``numberized``,
            ``data_dir``, ``prefix_name``, ``dest_dir``, ``save_name``,
            ``chunk_load`` and ``worker``.
    """
    text_tokenizer = utils.get_tokenizer(tokenizer=args.tokenizer, token=args.auth)
    sample_binarizer = Binarizer(
        tokenizer=text_tokenizer,
        append_eos=args.append_eos,
        already_numberized=args.numberized,
    )

    run_binarization = DatasetBinarizer.binarize_dataset_with_multiprocess
    run_binarization(
        binarizer=sample_binarizer,
        data_dir=args.data_dir,
        prefix_name=args.prefix_name,
        dest_dir=args.dest_dir,
        save_name=args.save_name,
        chunk_load=args.chunk_load,
        num_worker=args.worker,
    )

    # Only reached when the binarization call returned without raising.
    logging.info("Building Dataset successfully.")


if __name__ == '__main__':
    # Script entry point: obtain the preprocessing options and run the pipeline.
    main(parse_preprocess_args())