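# NOTE: Earlier draft, kept commented out for reference only. It calls
# `total_tokens_in_directory`, which is not defined in this file; the active
# implementation is `count_tokens_in_bin_files` below.
#
# import os
# import glob
# from tqdm import tqdm
#
# # Replace 'your_directory_path' with the path to the directory containing your .bin files
# directory_path = '/scratch/07946/ss95332/data/lit-redpajama'
# total_tokens = total_tokens_in_directory(directory_path)
# print(f"Total number of tokens: {total_tokens}")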

import os
import glob
import numpy as np
from pathlib import Path


def count_tokens_in_bin_files(source_path: Path, file_pattern: str = "*.bin") -> int:
    """Count the tokens stored in the files of a directory that match a pattern.

    Every matching file is read with ``numpy.load`` and contributes ``len()``
    of the loaded object to the total (for an array, that is the length of its
    first axis). The path of each file is printed as it is processed.

    Args:
        source_path: Directory to search.
        file_pattern: Glob pattern, relative to ``source_path``. Subdirectories
            are searched only when the pattern itself contains ``**``; the
            default ``"*.bin"`` matches files directly inside ``source_path``.

    Returns:
        The sum of ``len()`` over all loaded files.

    Raises:
        RuntimeError: If no file matches ``file_pattern`` in ``source_path``.
    """
    # Sort so the processing order (and the printed log) is the same on every
    # run; glob returns matches in arbitrary, filesystem-dependent order.
    filenames = sorted(
        glob.glob(os.path.join(source_path, file_pattern), recursive=True)
    )

    if not filenames:
        # Report the pattern that was actually used, not a hard-coded ".bin".
        raise RuntimeError(
            f"No files matching {file_pattern!r} found in {source_path}."
        )

    total_token_count = 0
    for filename in filenames:
        print(f"Processing {filename}")

        # SECURITY: allow_pickle=True lets numpy unpickle arbitrary objects,
        # which can execute code embedded in a crafted file. Only point this
        # function at data you trust.
        tokenized_data = np.load(filename, allow_pickle=True)
        try:
            # Assumes the loaded object is a flat sequence of tokens.
            total_token_count += len(tokenized_data)
        finally:
            # np.load returns an NpzFile for zip archives, which keeps a file
            # handle open until closed; plain arrays have no close() method.
            close = getattr(tokenized_data, "close", None)
            if callable(close):
                close()

    return total_token_count


# Example run: total the tokens found under the lit-redpajama data directory.
# NOTE(review): these statements execute whenever the module is loaded,
# including on import; consider an `if __name__ == "__main__":` guard.
source_path = Path("/scratch/07946/ss95332/data/lit-redpajama")  # Replace with your actual path
total_tokens = count_tokens_in_bin_files(source_path=source_path)
print(f"Total number of tokens: {total_tokens}")

