import pandas as pd
import json
import os
from typing import Dict, List, Any


def load_jsonl(file_path: str) -> List[Dict]:
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example an empty line at the end of the
    file) are skipped instead of being handed to the JSON parser.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        The parsed values, in file order. An empty file yields ``[]``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        # The `if text` filter drops blank lines, which json.loads rejects.
        return [json.loads(text) for text in stripped_lines if text]

def load_parquet(file_path: str) -> pd.DataFrame:
    """Read the Parquet file at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_parquet(file_path)
    return frame

# System message placed in front of every prompt written to the output file.
_SYSTEM_PROMPT = (
    'Your task is to follow a systematic, thorough reasoning process before providing the final solution. '
    'This involves analyzing, summarizing, exploring, reassessing, and refining your thought process through multiple iterations. '
    'Structure your response into two sections: Thought and Solution. '
    'In the Thought section, present your reasoning using the format: "<think>\n {thoughts} </think>\n". '
    'Each thought should include detailed analysis, brainstorming, verification, and refinement of ideas. '
    'After "</think>\n," in the Solution section, provide the final, logical, and accurate answer, clearly derived from the exploration in the Thought section. '
    'If applicable, include the answer in \\boxed{} for closed-form results like multiple choices or mathematical solutions.'
)

# Output columns, in order. Passed explicitly so that an input with zero rows
# still produces a frame carrying these columns.
_OUTPUT_COLUMNS = ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info']


def _first_message(prompt):
    """Return ``[prompt[0]]``, or ``[]`` when ``prompt`` has no first element."""
    try:
        return [prompt[0]]
    except (IndexError, KeyError, TypeError):
        # Empty sequence, or a missing/null value that cannot be indexed.
        return []


def process_data(parquet_file: str, output_file: str):
    """Prepend a system message to every prompt and save a new parquet file.

    For each input row the output ``prompt`` is a two-element list: a
    ``system`` message followed by the first message of the original
    prompt (any further original messages are dropped). The columns
    ``data_source``, ``ability``, ``reward_model`` and ``extra_info`` are
    copied through, falling back to ``''``, ``'math'``, ``{}`` and ``{}``
    respectively when the column is absent.

    Rows whose prompt is missing or empty get a prompt holding only the
    system message, and a warning with the number of such rows is printed.

    Args:
        parquet_file: Path of the source parquet file.
        output_file: Path the processed parquet file is written to.
    """
    # Load the source data.
    print("正在加载parquet文件...")
    df_parquet = load_parquet(parquet_file)
    print(f"Parquet文件包含 {len(df_parquet)} 行数据")

    processed_data = []
    rows_without_prompt = 0

    for _, row in df_parquet.iterrows():
        first_message = _first_message(row.get('prompt', []))
        if not first_message:
            rows_without_prompt += 1

        processed_data.append({
            'data_source': row.get('data_source', ''),
            # A fresh system dict per row, so rows never share one object.
            'prompt': [{'role': 'system', 'content': _SYSTEM_PROMPT}, *first_message],
            'ability': row.get('ability', 'math'),
            'reward_model': row.get('reward_model', {}),
            'extra_info': row.get('extra_info', {}),
        })

    if rows_without_prompt:
        print(f"警告：{rows_without_prompt} 行数据缺少prompt，仅写入system消息")

    # Build the output frame.
    new_df = pd.DataFrame(processed_data, columns=_OUTPUT_COLUMNS)
    # Preview the first converted prompt; iloc[0] would raise on an empty frame.
    if not new_df.empty:
        print(new_df.iloc[0]['prompt'])

    # Save as parquet.
    print(f"\n正在保存到 {output_file}...")
    new_df.to_parquet(output_file, index=False)
    print(f"成功保存 {len(new_df)} 行数据到 {output_file}")

if __name__ == "__main__":
    # Source dataset and destination for the converted copy.
    parquet_file = "/mnt/shared-storage-user/p1-shared/wangfuting/codes/rl/LUFFY/data/l1/deepscaler.parquet"

    output_file = "/mnt/shared-storage-user/p1-shared/wangfuting/codes/rl/LUFFY/data/l1/deepscaler_base_luffy_style.parquet"

    # Stop early with exit status 1 if the input file is missing.
    if not os.path.exists(parquet_file):
        print(f"错误：找不到parquet文件 {parquet_file}")
        # SystemExit is raised directly because the bare exit() helper is
        # injected by the site module and is not guaranteed to exist
        # (e.g. under `python -S`).
        raise SystemExit(1)

    # Run the conversion.
    process_data(parquet_file, output_file)
