# tools/build_order_insert_shift_random.py
# -*- coding: utf-8 -*-
import os, struct
import numpy as np
from pathlib import Path

# ===== Edit the paths / parameters below to match your setup =====
# Prefer raw strings (r"...") or forward-slash paths.

# Dataset size; when left as None it is detected from DATA_PREFIX instead.
DATASET_SIZE = None
# Path prefix without the .idx/.bin extension; required when DATASET_SIZE is None.
DATA_PREFIX = r""
# Destination of the generated order array (.npy).
ORDER_OUT = r"\.npy"

# Multiplied by START_STEP to obtain the insertion offset.
STEP_SIZE = 1_024
# Step at which the insertion starts.
START_STEP = 900
# Number of indices drawn at random.
K_SAMPLES = 12_500

# Lower bound of the sampling range (inclusive).
RANGE_LO = 1_024_000
# Upper bound of the sampling range (exclusive).
RANGE_HI = 2_048_000
# Random seed; change as needed.
RAND_SEED = 42
# =================================================================

def detect_size(prefix: str) -> int:
    r"""Return the element count stored in the header of ``prefix + ".idx"``.

    Two header layouts are recognised, told apart by their magic bytes:

    * ``b"MMIDIDX\x00\x00"`` (9 bytes), followed by an 8-byte version and a
      1-byte dtype code, then the count;
    * ``b"TNTIDX\x00\x00"`` (8 bytes), followed by an 8-byte version and
      16 bytes holding code and element size, then the count.

    In both layouts the count is read as a little-endian unsigned 64-bit
    integer.

    Args:
        prefix: Path of the index file without the ``.idx`` extension.

    Returns:
        The count read from the header.

    Raises:
        ValueError: If the magic bytes match neither layout, or the file ends
            before the count field.
        OSError: If the index file cannot be opened.
    """
    mmap_magic = b"MMIDIDX\x00\x00"
    legacy_magic = b"TNTIDX\x00\x00"
    count_size = struct.calcsize("<Q")

    # Offset of the count field in each layout: magic + the fields before it.
    mmap_offset = len(mmap_magic) + 8 + 1       # version, dtype_code
    legacy_offset = len(legacy_magic) + 8 + 16  # version, (code, element_size)

    idx = prefix + ".idx"
    with open(idx, "rb") as f:
        # A single read covering the longer of the two headers is enough.
        header = f.read(max(mmap_offset, legacy_offset) + count_size)

    if header.startswith(mmap_magic):
        offset = mmap_offset
    elif header.startswith(legacy_magic):
        offset = legacy_offset
    else:
        raise ValueError("unknown idx format")

    # Guard against a short file so the caller gets a clear error instead of
    # a low-level struct.error from unpacking too few bytes.
    if len(header) < offset + count_size:
        raise ValueError(f"truncated idx header: {idx}")

    (n,) = struct.unpack_from("<Q", header, offset)
    return int(n)

def main():
    """Build an index order with a random block inserted and save it as .npy.

    Driven entirely by the module-level configuration:

    1. Determine the dataset size N from ``DATASET_SIZE`` or, when that is
       None, from the ``.idx`` header located via ``DATA_PREFIX``.
    2. Compute the insertion offset ``start = START_STEP * STEP_SIZE``.
    3. Draw ``K_SAMPLES`` distinct indices from ``[RANGE_LO, RANGE_HI)``
       clipped to ``[0, N]``, using a ``RandomState`` seeded with
       ``RAND_SEED``.
    4. Save ``[0..start-1] + selected + [start..N-1]`` to ``ORDER_OUT``.

    The output has ``N + K_SAMPLES`` entries: every sampled index appears
    once in the inserted block and once more at its original position.

    Raises:
        ValueError: If ``DATA_PREFIX`` is needed but empty, if ``start`` is
            negative or exceeds N, if the clipped sampling range is empty,
            or if ``K_SAMPLES`` exceeds the size of that range. Errors from
            ``detect_size`` propagate unchanged.
    """
    # 1) Determine N.
    if DATASET_SIZE is None:
        if not DATA_PREFIX:
            raise ValueError("当 DATASET_SIZE=None 时，请设置 DATA_PREFIX 以从 .idx 探测 N")
        N = detect_size(DATA_PREFIX)
    else:
        N = int(DATASET_SIZE)
    print(f"[info] dataset size N = {N}")

    # 2) Compute the insertion offset.
    start = START_STEP * STEP_SIZE
    if start < 0:
        # A negative offset would leave the prefix empty and make the tail
        # begin with negative indices, producing an invalid order.
        raise ValueError(f"START_STEP 不能为负：start={start}")
    if start > N:
        raise ValueError(f"START_STEP 太大：start={start} > N={N}")

    # 3) Prepare the sampling pool: the configured range intersected with [0, N].
    lo = max(0, min(N, RANGE_LO))
    hi = max(0, min(N, RANGE_HI))
    if hi <= lo:
        raise ValueError(f"抽样区间为空：[{lo},{hi}) 与 N 不相交")
    pool = np.arange(lo, hi, dtype=np.int64)
    if K_SAMPLES > pool.size:
        raise ValueError(f"抽样数量 K={K_SAMPLES} 大于区间大小 {pool.size}")
    # NOTE: with the shipped defaults the range begins at step 1000
    # (RANGE_LO / STEP_SIZE), i.e. after the insertion point at step 900.
    # That relationship is a property of the configuration; it is not
    # enforced here.

    # 4) Sample without replacement.
    rs = np.random.RandomState(RAND_SEED)
    selected = rs.choice(pool, size=K_SAMPLES, replace=False)
    # The block keeps the random order returned by choice(); uncomment the
    # next line to insert the sampled indices in ascending order instead.
    # selected = np.sort(selected)

    # 5) Assemble the new order: [0..start-1] + selected + [start..N-1].
    pre = np.arange(start, dtype=np.int64)
    tail = np.arange(start, N, dtype=np.int64)
    order = np.concatenate([pre, selected, tail], axis=0)

    # 6) Basic sanity checks (reported only, not enforced).
    ok_len = (len(order) == N + K_SAMPLES)
    ok_blk = np.array_equal(order[start:start + K_SAMPLES], selected)
    ok_tail = np.array_equal(order[start + K_SAMPLES:], tail)
    print(f"[check] len={ok_len}, block={ok_blk}, shifted_tail={ok_tail}")

    # 7) Save.
    out = Path(ORDER_OUT)
    # np.save appends ".npy" when the filename lacks it; resolve the final
    # name up front so the log line below reports the file actually written.
    if not str(out).endswith(".npy"):
        out = Path(str(out) + ".npy")
    out.parent.mkdir(parents=True, exist_ok=True)
    np.save(out, order)
    print(f"[done] wrote: {out}")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
