import numpy as np
import os


def count_tokens_in_bin_file(file_path, dtype=np.uint16):
    """Return the number of fixed-width tokens stored in a raw binary file.

    The file is interpreted as a flat, header-less array of ``dtype``
    elements, one element per token.

    Args:
        file_path: Path to the binary token file.
        dtype: NumPy dtype of a single token. Defaults to ``np.uint16``,
            the width this function originally hard-coded.

    Returns:
        The number of tokens in the file. Returns 0 when the file is
        missing, is empty, or cannot be read (for example when its size is
        not a multiple of the token width); a message is printed for the
        missing and unreadable cases.
    """
    # Keep the dedicated "not found" message for the most common mistake.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return 0

    try:
        # np.memmap refuses to map a zero-byte file, but an empty file is
        # not an error: it simply contains no tokens.
        if os.path.getsize(file_path) == 0:
            return 0

        # Memory-map instead of loading, so the file contents are not
        # copied into memory just to learn the array length.
        tokens = np.memmap(file_path, dtype=dtype, mode='r')

        # With no explicit shape the map is one-dimensional, so its length
        # is the token count.
        return len(tokens)
    except (OSError, ValueError) as e:
        # OSError: the path cannot be opened or mapped (permissions,
        # directory, file removed after the existence check).
        # ValueError: the file size is not a multiple of the token width.
        print(f"Error while reading the file {file_path}: {e}")
        return 0


# Location of the token file to inspect (currently the validation split, val.bin)
train_bin_path = '/scratch/07946/ss95332/data/openwebtext_9m/val.bin'  # Edit to point at your own .bin file

# Tally the tokens in that file and report the result
total_tokens = count_tokens_in_bin_file(file_path=train_bin_path)
print(f"Total number of tokens in {train_bin_path}: {total_tokens}")
