import pandas as pd
import numpy as np


# Build an enlarged validation set: rows whose ``data_source`` is one of
# UPSAMPLED_SOURCES are repeated UPSAMPLE_FACTOR times, every other row is
# kept exactly once, and the result is written to a new parquet file.
data = "/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/luffy/valid.all.parquet"

# Values of the "data_source" column that get oversampled, and how many
# copies of each matching row end up in the output.
UPSAMPLED_SOURCES = ["math", "olympiad_bench"]
UPSAMPLE_FACTOR = 4

df = pd.read_parquet(data)

# Vectorized membership test instead of a per-row ``iloc`` loop. Missing
# values compare as False, so such rows fall into the "kept once" group.
is_upsampled = df["data_source"].isin(UPSAMPLED_SOURCES)

# Output layout: the whole oversampled subset repeated back-to-back
# UPSAMPLE_FACTOR times, followed by the remaining rows once. Slicing the
# frame (rather than rebuilding it from per-row Series) keeps the column
# dtypes and the column set even when a group is empty. The original index
# labels are kept, so labels of oversampled rows appear multiple times.
final_data = pd.concat(
    [df[is_upsampled]] * UPSAMPLE_FACTOR + [df[~is_upsampled]]
)

# Save the result as a parquet file.
output_file = "/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/luffy/valid_more_all.parquet"
final_data.to_parquet(output_file)
print(f"\n数据已保存到: {output_file}")
print(f"总共 {len(final_data)} 条数据")
