import argparse
import pprint
from pado.data.datasets.speech.librispeech import PadoLibriSpeech
from pado.core import set_logger

# Module-level logger shared by this script, created through pado's
# set_logger helper.
# NOTE(review): the empty name presumably selects the root/default logger —
# confirm against pado.core.set_logger.
logger = set_logger("")


def run(args):
    """Log the number of samples in each LibriSpeech split.

    Pretty-prints ``args`` first, then constructs a ``PadoLibriSpeech``
    dataset (with ``script_only=True``) for every split listed below and
    logs its length.

    Args:
        args: Object exposing ``data_dir`` and ``clean_script`` attributes;
            when run as a script this is the parsed argparse namespace.
    """
    pprint.pprint(args)

    splits = (
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
    )

    for split in splits:
        corpus = PadoLibriSpeech(
            args.data_dir,
            mode=split,
            clean_script=args.clean_script,
            script_only=True,
        )
        n_samples = len(corpus)
        logger.info(f"LibriSpeech {split}: {n_samples} samples.")


if __name__ == "__main__":
    # Script entry point: parse the command-line options and hand them to run().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--data_dir",
        type=str,
        help="LibriSpeech dataset path",
    )
    cli.add_argument(
        "--clean_script",
        action="store_true",
        help="Clean text script (default: F)",
    )

    run(cli.parse_args())
