#!/usr/bin/env python
import argparse
from datasets import Dataset
from utils import load_single_dataset, save_dataset

def main(argv=None):
    """Filter a dataset to one ``ability`` value, shuffle it, and save the result.

    The function calls ``load_single_dataset(input_path, dataset_split="train")``,
    keeps only the rows whose ``"ability"`` field equals ``--ability``
    (default ``"math"``), shuffles them with ``--seed`` (default ``42``), and
    hands the result to ``save_dataset`` together with ``--output_path``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``, which is what the script
            entry point relies on.

    Raises:
        SystemExit: Raised by argparse when a required argument is missing or
            an argument value cannot be parsed.
    """
    parser = argparse.ArgumentParser(
        description="筛选 HuggingFace 数据集：仅保留 ability 等于指定值（默认 math）的样本，并用固定种子打乱。"
    )
    # Both paths are mandatory: without them the loader/saver would receive
    # None and fail with a far less helpful error than argparse's own message.
    parser.add_argument("--input_path", type=str, required=True, help="输入数据集路径（load_from_disk 的路径）")
    parser.add_argument("--output_path", type=str, required=True, help="保存数据集路径（save_to_disk 的目标路径）")
    # Optional knobs; the defaults reproduce the previously hard-coded values.
    parser.add_argument("--ability", type=str, default="math", help="需要保留的 ability 取值（默认 math）")
    parser.add_argument("--seed", type=int, default=42, help="打乱数据时使用的随机种子（默认 42）")
    parser.add_argument("--num_proc", type=int, default=64, help="过滤时使用的进程数（默认 64）")
    args = parser.parse_args(argv)

    # Load the existing dataset.
    ds: Dataset = load_single_dataset(args.input_path, dataset_split="train")

    # Keep only the rows with the requested ability, then shuffle reproducibly.
    wanted_ability = args.ability
    ds = ds.filter(lambda row: row["ability"] == wanted_ability, num_proc=args.num_proc)
    ds = ds.shuffle(seed=args.seed)

    # Save the processed dataset.
    save_dataset(ds, args.output_path)
    print(f"处理完成，数据已保存到 {args.output_path}")

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

"""
~/verl_cs/.conda/bin/python ~/verl_cs/scripts/out_code_task.py \
    --input_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part1.json \
    --output_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part1_onlymath.json 
~/verl_cs/.conda/bin/python ~/verl_cs/scripts/out_code_task.py \
    --input_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part2.json \
    --output_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part2_onlymath.json 
~/verl_cs/.conda/bin/python ~/verl_cs/scripts/out_code_task.py \
    --input_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part3.json \
    --output_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part3_onlymath.json 
~/verl_cs/.conda/bin/python ~/verl_cs/scripts/out_code_task.py \
    --input_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part4.json \
    --output_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part4_onlymath.json 
~/verl_cs/.conda/bin/python ~/verl_cs/scripts/out_code_task.py \
    --input_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part5.json \
    --output_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part5_onlymath.json 
~/verl_cs/.conda/bin/python ~/verl_cs/scripts/out_code_task.py \
    --input_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part6.json \
    --output_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part6_onlymath.json 
~/verl_cs/.conda/bin/python ~/verl_cs/scripts/out_code_task.py \
    --input_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part7.json \
    --output_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part7_onlymath.json 
~/verl_cs/.conda/bin/python ~/verl_cs/scripts/out_code_task.py \
    --input_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part8.json \
    --output_path ~/LLaMA-Factory-250514/saves_shuyan/qwen3-0.6B-base/prime-sft/prime-rl-rollouts/part8_onlymath.json 


"""