import sys
import os
import struct

# Add the gpt-neox checkout to the Python path so that `megatron` can be imported.
sys.path.append("/root/gpt-neox")

try:
    from megatron.data.indexed_dataset import MMapIndexedDataset
except ImportError:
    print("❌ 无法导入 MMapIndexedDataset，请检查路径")
    # Use sys.exit rather than the bare `exit` helper: `exit` is injected by the
    # `site` module for interactive sessions and is not guaranteed to exist
    # (e.g. under `python -S` or in frozen builds).
    sys.exit(1)

# Dataset path prefix, without the file suffix (".idx" is appended when reading).
FILE_PATH = "/root/trainbin1/document2"

def debug_dataset_loading():
    """Diagnose why the indexed dataset at ``FILE_PATH`` fails to load.

    Prints the header magic the loader class expects, the magic and version
    bytes actually stored at the start of ``FILE_PATH + ".idx"``, reports
    whether the two magics match, and finally tries to construct
    ``MMapIndexedDataset`` so that the concrete failure (if any) is shown.

    Returns:
        None. All results are printed to stdout.

    Raises:
        OSError: If the ``.idx`` file cannot be opened.
        AttributeError: If no ``_HDR_MAGIC`` constant can be located on the
            loader class.
    """
    print(f"🕵️‍♀️ 尝试加载 {FILE_PATH} ...")

    # Magic constant defined by the loader. Prefer the attribute on the class
    # itself; fall back to the nested ``Index`` class.
    # NOTE(review): upstream Megatron / GPT-NeoX appears to define _HDR_MAGIC
    # on MMapIndexedDataset.Index rather than on the dataset class - confirm
    # against the installed version.
    expected_magic = getattr(MMapIndexedDataset, "_HDR_MAGIC", None)
    if expected_magic is None:
        expected_magic = MMapIndexedDataset.Index._HDR_MAGIC
    print(f"✅ 代码期望的 Magic Hex: {expected_magic.hex()}")

    # Read the file header by hand for comparison. The magic is read with the
    # same length as the expected constant: a hard-coded length would make the
    # equality check below fail whenever the constant is not exactly that
    # long, and would also misalign the version field that follows it.
    with open(FILE_PATH + ".idx", "rb") as f:
        file_magic = f.read(len(expected_magic))
        # Assumes the version field is 8 bytes wide - TODO confirm against
        # the index writer.
        file_ver_bytes = f.read(8)

    print(f"📄 文件实际的 Magic Hex: {file_magic.hex()}")
    print(f"📄 文件实际的 Version Hex: {file_ver_bytes.hex()}")

    if file_magic != expected_magic:
        print("❌ 致命错误: Magic 不匹配！")
    else:
        print("✅ Magic 匹配。")

    # Try to instantiate even after a mismatch so the loader's own error is
    # shown. Only the constructor is guarded, so a problem in the reporting
    # code below is not misreported as an instantiation failure.
    try:
        ds = MMapIndexedDataset(FILE_PATH, skip_warmup=True)
    except AssertionError as e:
        print(f"❌ 实例化失败 (AssertionError): {e}")
        return
    except Exception as e:
        print(f"❌ 实例化失败 (Exception): {e}")
        return

    print("🎉 成功实例化 Dataset！没有报错。")
    # NOTE(review): the dataset may expose its dtype via ``_index.dtype``
    # instead of ``_dtype`` - look it up defensively; prints None if neither
    # attribute exists.
    dtype = getattr(ds, "_dtype", None)
    if dtype is None:
        dtype = getattr(getattr(ds, "_index", None), "dtype", None)
    print(f"   Dtype: {dtype}")
    print(f"   Total Size: {len(ds)}")

# Run the diagnostic only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    debug_dataset_loading()
