from microsoft_nlp import paths
from microsoft_nlp.data_to_records import TokenNpzToRecords

# Tokens written to each output file: 16 * 512 * 1024 = 8,388,608.
# This should result in ~16 MB files (that size would correspond to
# 2 bytes per token -- TODO confirm the on-disk token width).
tokens_per_file = 16 * 512 * 1024

# Destination for the converted records, under the project's datasets root.
target_dir = paths.datasets / "owt2-tokenized-allrecords"

# Called without exist_ok on purpose: assuming `paths.datasets` is a
# pathlib.Path, this raises FileExistsError when the directory is already
# there, so an earlier run's output is not silently reused. Use
# `target_dir.mkdir(exist_ok=True)` instead to write into an existing directory.
target_dir.mkdir()

converter = TokenNpzToRecords(tokens_per_file=tokens_per_file)

# NOTE(review): kind=None presumably selects every kind of data (the target
# directory is named "allrecords") -- confirm against TokenNpzToRecords.convert_all.
converter.convert_all(
    target_dir=target_dir,
    kind=None,
)
