# 读取 /root/trainbin1/document1.idx 得到样本数，
# 把 /root/trainbin1/mask_all_true.npy 覆盖到起点 1,024,000，
# 生成与数据等长的新掩码 /root/trainbin1/mask_aligned.npy

import os, struct, numpy as np

IDX = "/root/trainbin1/document1.idx"
ORIG_MASK = "/root/trainbin1/mask_all_true.npy"
OUT_MASK = "/root/trainbin1/mask_aligned.npy"
START = 1024000  # dataset index at which the original mask begins

# Header layout this script relies on: 9-byte magic, 8-byte version,
# 1-byte dtype code, then a little-endian uint64 holding the sample count.
_COUNT_OFFSET = 9 + 8 + 1
_HEADER_SIZE = _COUNT_OFFSET + 8


def read_sample_count(idx_path):
    """Return the sample count stored in the header of the index file.

    Only the first 26 bytes are read; the magic, version and dtype code are
    skipped without being validated.

    Raises:
        ValueError: if the file is shorter than the 26-byte header.
    """
    with open(idx_path, "rb") as f:
        header = f.read(_HEADER_SIZE)
    if len(header) < _HEADER_SIZE:
        raise ValueError(
            f"{idx_path}: index header truncated "
            f"({len(header)} of {_HEADER_SIZE} bytes)"
        )
    return struct.unpack_from("<Q", header, _COUNT_OFFSET)[0]


def build_aligned_mask(orig, n_samples, start):
    """Build a boolean mask of length ``n_samples`` with ``orig`` overlaid at ``start``.

    Positions not covered by ``orig`` are True. Values of ``orig`` are
    converted with ``astype(bool)``, so any non-zero entry counts as True.
    The part of ``orig`` that would fall outside ``[0, n_samples)`` is dropped.

    Returns:
        ``(mask, lo, hi)`` where ``lo``/``hi`` are the clamped window bounds;
        nothing was overlaid when ``lo >= hi``.
    """
    orig = np.asarray(orig)
    mask = np.ones(n_samples, dtype=bool)

    lo = max(0, start)
    hi = min(start + len(orig), n_samples)
    if lo < hi:
        # When start is negative the window is clipped at 0, so the source
        # must skip the same number of leading entries to stay aligned.
        src = lo - start
        mask[lo:hi] = orig[src:src + (hi - lo)].astype(bool)
    return mask, lo, hi


if __name__ == "__main__":
    # Read the sample count from the idx header; the rest of the file is not needed.
    n_samples = read_sample_count(IDX)
    orig = np.load(ORIG_MASK)
    print("orig_mask len:", len(orig), "dtype:", orig.dtype)
    print("dataset samples:", n_samples)

    # Aligned mask: True everywhere, with the original mask copied into its window.
    out, i0, i1 = build_aligned_mask(orig, n_samples, START)
    if i0 >= i1:
        print("warning: START 超过数据样本数，本次不会覆盖任何位置")

    np.save(OUT_MASK, out)
    print("wrote:", OUT_MASK)
    print("out_mask len:", len(out), "true_count:", int(out.sum()))
    print("covered window:", i0, "->", i1)
