# gpt-neox/check_document1_idx.py
# 只检查 /root/trainbin1/document1.{idx,bin} 是否匹配；不依赖仓库代码

import os, struct, numpy as np

PREFIX = "/root/trainbin1/document1"  # dataset prefix to check (".idx"/".bin" are appended)
idx_path, bin_path = PREFIX + ".idx", PREFIX + ".bin"

# Magic bytes expected at the start of an mmap indexed-dataset index file
# (the layout parsed below: magic, version, dtype code, lengths, sizes, pointers).
IDX_MAGIC = b"MMIDIDX\x00\x00"


def read_index(path):
    """Parse the header and the sizes/pointers arrays of an ``.idx`` file.

    Args:
        path: Path of the index file.

    Returns:
        A tuple ``(version, dtype_code, sizes_len, doc_len, sizes, pointers)``
        where ``sizes`` is a ``uint32`` array and ``pointers`` a ``uint64``
        array, each read with ``count=sizes_len``.

    Raises:
        ValueError: If the file does not start with ``IDX_MAGIC``; the rest of
            the header would be meaningless in that case.
        struct.error: If the header is truncated.
    """
    with open(path, "rb") as f:
        magic = f.read(len(IDX_MAGIC))
        if magic != IDX_MAGIC:
            raise ValueError(
                f"{path}: unexpected magic {magic!r}, expected {IDX_MAGIC!r}"
            )
        (version,) = struct.unpack("<Q", f.read(8))
        (dtype_code,) = struct.unpack("<B", f.read(1))
        (sizes_len,) = struct.unpack("<Q", f.read(8))
        (doc_len,) = struct.unpack("<Q", f.read(8))
        sizes = np.fromfile(f, dtype=np.uint32, count=sizes_len)
        pointers = np.fromfile(f, dtype=np.uint64, count=sizes_len)
    return version, dtype_code, sizes_len, doc_len, sizes, pointers


def last_end_bytes(sizes, pointers, itemsize):
    """Return the byte offset just past the last sequence for a given itemsize.

    In the MMIDIDX layout the pointers are *byte* offsets into the ``.bin``
    file while the sizes are element counts, so the end of the last sequence
    is ``pointers[-1] + sizes[-1] * itemsize``.

    Args:
        sizes: Per-sequence element counts.
        pointers: Per-sequence byte offsets into the ``.bin`` file.
        itemsize: Candidate size in bytes of one stored element.

    Returns:
        The end offset in bytes as a Python ``int``; ``0`` for an empty index.
    """
    if len(sizes) == 0:
        return 0
    # Convert to Python ints first: mixing NumPy uint64 scalars with signed
    # integers can promote to float64 and lose precision on large offsets.
    return int(pointers[-1]) + int(sizes[-1]) * itemsize


def main():
    """Print the index header and check it against the size of the ``.bin`` file."""
    print("idx:", idx_path)
    print("bin:", bin_path)

    # Read the idx header and arrays.
    version, dtype_code, sizes_len, doc_len, sizes, pointers = read_index(idx_path)

    print(f"header: version={version} dtype_code={dtype_code} sizes_len={sizes_len} doc_idx_len={doc_len}")
    print(f"sizes[:5]={sizes[:5]} pointers[:5]={pointers[:5]}")
    print(f"sizes[-5:]={sizes[-5:]} pointers[-5:]={pointers[-5:]}")

    # Basic consistency. Raised explicitly (not via `assert`) so the check
    # still runs under `python -O`; a short read makes the lengths differ.
    if len(sizes) != len(pointers):
        raise AssertionError("sizes/pointers 长度不一致")
    mono = bool(np.all(pointers[:-1] <= pointers[1:]))
    print("pointers 非降序:", mono)

    # Size of the bin file in bytes.
    bin_bytes = os.path.getsize(bin_path)
    print("bin size (bytes):", bin_bytes)

    # Make no assumption about the element type: try the common item sizes.
    # Everything is compared in bytes because the pointers are byte offsets.
    # `within` means the last sequence fits inside the file; `exact` means it
    # ends precisely at the end of the file, which singles out the itemsize
    # the index was written with.
    for itemsize in (1, 2, 4, 8):
        max_tokens = bin_bytes // itemsize
        end = last_end_bytes(sizes, pointers, itemsize)
        ok = end <= bin_bytes
        exact = end == bin_bytes
        print(
            f"[itemsize={itemsize}] max_tokens={max_tokens} "
            f"last_end_bytes={end} within={ok} exact={exact}"
        )


if __name__ == "__main__":
    main()



