import os
import json
import re

# Directory whose *.json result files are converted by the functions below.
folder_path = '/Users/maxuyan/Documents/code/dify-main/output/APR/SWE-m-verified-6'
# Module-level accumulator that process_file() appends prediction records to.
output_list = []

# Customizable: value written to the "model_name_or_path" field of every record.
MODEL_NAME_OR_PATH = "Chatgpt"

def extract_patch(answer_str):
    """Return the contents of the first ```diff fenced block in *answer_str*.

    The text between the opening fence line and the closing fence is returned
    with leading and trailing whitespace stripped. An empty string is returned
    when no such fenced block is present.
    """
    found = re.search(r"```diff\n(.*?)\n```", answer_str, re.DOTALL)
    if found is None:
        return ""
    # The pattern has exactly one capture group: the fenced body.
    (body,) = found.groups()
    return body.strip()

def extract_patch_from_json(patchs):
    """Return the contents of the first ```json fenced block in *patchs*.

    The captured text is stripped of surrounding whitespace. When *patchs*
    contains no ```json fenced block, an empty string is returned.
    """
    fenced_json = re.search(r"```json\n(.*?)\n```", patchs, flags=re.DOTALL)
    if not fenced_json:
        return ""
    payload = fenced_json.group(1)
    return payload.strip()

def process_file(output_path='predictions-ver-1.json'):
    """Collect the top-ranked patch of every result file into one predictions file.

    Each ``*.json`` file in ``folder_path`` is read for its ``id`` and
    ``answer`` fields. ``answer`` is parsed as JSON, the string found at
    ``data.outputs.ranked_patches`` is searched for a ```json fenced block,
    and the ``patch`` of the first entry in that block's ``ranked_patches``
    list becomes the record's ``model_patch``.

    Files that do not have this shape are reported on stdout (file name, then
    the error) and skipped. Records are appended to the module-level
    ``output_list``, which is then written to ``output_path``.

    Args:
        output_path: Destination JSON file. Defaults to the file name this
            function has always written.
    """
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            # Close the file as soon as it is parsed instead of holding it open.
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            instance_id = data['id']
            # "answer" holds a JSON document serialized as a string.
            patchs = json.loads(data["answer"])
            patchs_clear = extract_patch_from_json(patchs['data']['outputs']['ranked_patches'])
            # If no fenced block was found this is "", which json.loads rejects
            # with a ValueError that is handled below.
            patchs_clear = json.loads(patchs_clear)
            patch = patchs_clear['ranked_patches'][0]['patch']
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # One malformed file must not abort the whole conversion.
            print(filename)
            print(e)
            continue
        output_list.append({
            "instance_id": instance_id,
            "model_patch": patch,
            "model_name_or_path": MODEL_NAME_OR_PATH,
        })

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_list, f, ensure_ascii=False, indent=2)

    # Report the path that was actually written (the old message named a different file).
    print(f"转换完成，已输出到 {output_path}，共{len(output_list)}条。")




def process_file_2(num_versions=8):
    """Write one predictions file per candidate-patch rank.

    Each ``*.json`` file in ``folder_path`` is read once for its ``id`` and
    ``answer`` fields; ``answer`` is parsed as JSON and the list at
    ``data.outputs.final_patch`` is taken as that instance's candidate
    patches. For every rank ``i`` in ``range(num_versions)`` the file
    ``predictions-ver-new-{i}.json`` is written with the ``i``-th candidate of
    every instance that has at least ``i + 1`` candidates.

    Files that cannot be parsed into this shape are reported on stdout and
    skipped.

    Args:
        num_versions: Number of ranks (and therefore output files) to emit.
            Defaults to 8, the count this function has always produced.
    """
    # Parse every input file once up front rather than once per rank.
    candidates = []  # (instance_id, list of candidate patches)
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            instance_id = data['id']
            # "answer" holds a JSON document serialized as a string.
            final_patch_list = json.loads(data["answer"])['data']['outputs']['final_patch']
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # One malformed file must not abort the whole conversion.
            print(f"{filename}: {e!r}")
            continue
        if not isinstance(final_patch_list, list):
            # Indexing a string by rank would emit single characters as patches.
            print(f"{filename}: final_patch is not a list")
            continue
        candidates.append((instance_id, final_patch_list))

    for i in range(num_versions):
        output_list = [
            {
                "instance_id": instance_id,
                "model_patch": patches[i],
                "model_name_or_path": MODEL_NAME_OR_PATH,
            }
            for instance_id, patches in candidates
            if i < len(patches)
        ]

        output_path = f'predictions-ver-new-{i}.json'
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_list, f, ensure_ascii=False, indent=2)

        print(f"转换完成，已输出到 {output_path}，共{len(output_list)}条。")

# Script entry point: there is no __main__ guard, so this also runs on import.
process_file_2()
                

# import pandas as pd
# import os,json

# def get_swe_dataset():
#     """
#     读取parquet文件并返回包含question和final_answer的字典数组
    
#     Returns:
#         list: 包含字典的列表，每个字典包含'question'和'answer'字段
#     """
#     try:
#         # 文件路径
#         file_path = '/Users/maxuyan/Documents/code/dify-main/templates/dataset/swe-bench-lite/test-00000-of-00001.parquet'
        
#         # 检查文件是否存在
#         if not os.path.exists(file_path):
#             print(f"文件 {file_path} 不存在")
#             return []
        
#         # 读取parquet文件
#         df = pd.read_parquet(file_path)
        
#         # 提取所需数据并转换为字典数组
#         swe_dataset = []
#         for _, row in df.iterrows():
#             instance_id = row['instance_id']
#             question = row['problem_statement']
#             repo = row['repo']
#             swe_dataset.append({
#                 'id': instance_id,
#                 'repo': repo,
#                 'question': question,
#                 'patch': row['patch']
#             })
#         return swe_dataset
        
#     except Exception as e:
#         print(f"读取文件时出错: {str(e)}")
#         return []

# swe_dataset = get_swe_dataset()
# for i, item in enumerate(swe_dataset):
#     # print(f"\n样本 {i+1}:")
#     # print(f"id: {item['id']}")
#     # print(f"问题: {item['question']}")
#     # print(f"repo: {item['repo']}")
#     # print("-" * 50)

#     dict_list = []
#     for i, item in enumerate(swe_dataset):
#         dict_list.append({
#             'instance_id': item['id'],
#             'model_patch': item['patch'],
#             'model_name_or_path': 'chatgpt'
#         })
    
#     # 将dict_list转换为json文件
#     with open('predictions.json', 'w', encoding='utf-8') as f:
#         json.dump(dict_list, f, ensure_ascii=False, indent=2)

#     print(f"转换完成，已输出到 predictions.json，共{len(dict_list)}条。")
