import pandas as pd
import json
import numpy as np

# Load the Parquet file that holds the evaluation questions.
file_path = 'AI_School_main_vllm/data/eval_datasets/MMLU_PRO/data/test-00000-of-00001.parquet'
df = pd.read_parquet(file_path)


def _plain_options(raw_options):
    """Return ``raw_options`` as a list if it is a NumPy array, else unchanged."""
    return raw_options.tolist() if isinstance(raw_options, np.ndarray) else raw_options


# Each row is expected to carry the question text, its options and the answer
# in the 'question', 'options' and 'answer' columns; one dict is built per row.
all_questions = [
    {
        'question': record['question'],
        # NumPy arrays are not JSON-serializable, so they are turned into lists.
        'options': _plain_options(record['options']),
        'answer': record['answer'],
    }
    for _, record in df.iterrows()
]

# Write the collected entries to a new JSON file (non-ASCII text kept as-is).
output_file = 'AI_School_main_vllm/data/eval_datasets/MMLU_PRO/data/all_questions.json'
with open(output_file, 'w', encoding='utf-8') as out_handle:
    json.dump(all_questions, out_handle, ensure_ascii=False, indent=4)

print(f"提取了 {len(all_questions)} 道题目，并保存到 '{output_file}'")

# import json

# def convert_to_alpaca(input_file, output_file):
#     """
#     Convert a conversation-format JSON file to Alpaca format.
#     :param input_file: path of the input JSON file
#     :param output_file: path of the output JSON file
#     """
#     with open(input_file, 'r', encoding='utf-8') as f:
#         original_data = json.load(f)
    
#     alpaca_data = []
#     for item in original_data:
#         # Extract the human and gpt turns of the conversation
#         human_input = None
#         gpt_output = None
#         for turn in item['conversations']:
#             if turn['from'] == 'human':
#                 human_input = turn['value']
#             elif turn['from'] == 'gpt':
#                 gpt_output = turn['value']
        
#         # Build the Alpaca-format entry
#         if human_input and gpt_output:
#             alpaca_entry = {
#                 "instruction": "You are a general-purpose task assistant.",
#                 "input": human_input,
#                 "output": gpt_output
#             }
#             alpaca_data.append(alpaca_entry)
    
#     # Save the converted data
#     with open(output_file, 'w', encoding='utf-8') as f:
#         json.dump(alpaca_data, f, ensure_ascii=False, indent=4)

# # Usage example
# if __name__ == "__main__":
#     input_file = "AI_School_main_vllm/data/OpenHermes-2___5/openhermes2_5.json"    # input file path
#     output_file = "AI_School_main_vllm/data/OpenHermes-2___5/openhermes_alpaca.json"  # output file path
#     convert_to_alpaca(input_file, output_file)
#     print(f"转换完成，结果已保存到 {output_file}")