from openai import OpenAI
from collect import write_jsonl, read_json, append_jsonl
import os


def split_error_files(meta_path, output_successful_path, output_failed_path):
    """Partition the records in ``meta_path`` by their ``has_answer`` flag.

    Records whose ``has_answer`` value equals ``1`` or ``"1"`` are written to
    ``output_successful_path``; every other record is written to
    ``output_failed_path``. Relative order is preserved within each output.

    Args:
        meta_path: Path handed to ``read_json``; the result is iterated as a
            sequence of dict-like records (assumed -- confirm against
            ``collect.read_json``).
        output_successful_path: Destination passed to ``write_jsonl`` for
            records flagged as answered.
        output_failed_path: Destination passed to ``write_jsonl`` for all
            remaining records.

    Raises:
        KeyError: If a record has no ``has_answer`` key.
    """
    answered = []
    unanswered = []
    for record in read_json(meta_path):
        # Choose the destination list first, then append exactly once.
        bucket = answered if record["has_answer"] in (1, "1") else unanswered
        bucket.append(record)

    write_jsonl(answered, output_successful_path)
    write_jsonl(unanswered, output_failed_path)


def dash_request(request_data):
    """Send one user message to the DashScope OpenAI-compatible endpoint.

    The request is streamed. Reasoning text (if the model emits any) is echoed
    to stdout as it arrives and accumulated; answer text is accumulated
    silently.

    Args:
        request_data: The text sent as the single ``user`` message.

    Returns:
        A ``(reasoning_content, answer_content)`` tuple of strings. Either may
        be empty.

    Raises:
        RuntimeError: If the ``DASHSCOPE_API_KEY`` environment variable is not
            set.
    """
    # Credentials must come from the environment; an API key committed to
    # source control is leaked to everyone who can read the repository.
    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "DASHSCOPE_API_KEY is not set; export your DashScope API key "
            "before calling dash_request()."
        )

    # Initialise the OpenAI-compatible client.
    client = OpenAI(
        api_key=api_key,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )

    reasoning_parts = []  # pieces of the streamed reasoning text
    answer_parts = []  # pieces of the streamed answer text

    # Create the chat completion request.
    completion = client.chat.completions.create(
        # qwen-plus-2025-04-28 is used here; it can be swapped for another
        # deep-thinking model that supports web search.
        model="qwen-plus-2025-04-28",
        messages=[{"role": "user", "content": request_data}],
        extra_body={
            # Deep-thinking switch; has no effect on QwQ models.
            "enable_thinking": False,
            "enable_search": True,  # turn on web search
            "search_options": {
                "forced_search": False,  # do not force a web search
                # Per the original note: the model searches 10 web results.
                "search_strategy": "pro",
            },
        },
        # QwQ models can only be called in streaming mode.
        stream=True,
        # Uncomment to receive token usage in the final chunk.
        # stream_options={"include_usage": True},
    )

    for chunk in completion:
        # A chunk without choices is printed as the usage report.
        if not chunk.choices:
            print("\n" + "=" * 20 + "Usage" + "=" * 20)
            print(chunk.usage)
            continue

        delta = chunk.choices[0].delta

        # Echo and collect the reasoning text when present.
        reasoning_piece = getattr(delta, "reasoning_content", None)
        if reasoning_piece is not None:
            print(reasoning_piece, end="", flush=True)
            reasoning_parts.append(reasoning_piece)

        # ``content`` can be None on some streamed deltas; concatenating it
        # directly would raise TypeError. It is also checked independently of
        # the reasoning field so answer text is never dropped.
        answer_piece = getattr(delta, "content", None)
        if answer_piece:
            answer_parts.append(answer_piece)

    return "".join(reasoning_parts), "".join(answer_parts)


def crash_dataset(meta_path, output_path, start_index=5090, batch_size=10):
    """Ask the model to verify/correct references, batch by batch.

    Records are read from ``meta_path``; for each batch only the ``question``
    and ``reference`` fields are placed into the prompt and sent through
    ``dash_request``. The fenced ```json block in the reply is parsed and the
    result is appended to ``output_path`` with ``append_jsonl``.

    Args:
        meta_path: Path handed to ``read_json``.
        output_path: Path handed to ``append_jsonl`` after every batch.
        start_index: Index of the first record to process. Defaults to 5090,
            the offset that was previously hard-coded (presumably a resume
            point from an earlier run -- confirm before relying on it).
        batch_size: Number of records sent per request. Defaults to 10.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, ``start_index`` is
            negative, or a reply contains no fenced ```json block (or its
            content is not a valid literal).
        SyntaxError: If the fenced content is neither valid JSON nor a valid
            Python literal.
    """
    import ast
    import json
    import re

    # Validate before any I/O so a bad argument never costs an API call.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")
    if start_index < 0:
        raise ValueError(f"start_index must be >= 0, got {start_index!r}")

    # Compiled once; tolerant of "\r\n" or extra whitespace around the fence.
    fenced_json = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)

    def _get_response_json(_response_content):
        """Extract and parse the fenced ```json block from a model reply."""
        json_match = fenced_json.search(_response_content)
        print(f"_response_content = {_response_content}")
        print(f"json_match = {json_match}")
        if json_match is None:
            raise ValueError("model response contains no ```json fenced block")
        json_str = json_match.group(1)
        # SECURITY: the reply is untrusted text (the model may echo web search
        # results), so it must never be passed to eval(). Parse it as JSON
        # first; fall back to literal_eval only for Python-style literals
        # (single-quoted strings), which cannot execute code.
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return ast.literal_eval(json_str)

    prompt = """
        {meta_data_list}
        请你检查列表中每一项 question 对应的 reference 是否正确，如果正确则不改变其答案；如果不正确请修改 reference 中的内容。
        请注意，不要更改列表的格式，并且 question 和 reference 均是英文形式给出；并且你需要以json格式返回，即```json```。
    """
    meta_data_list = read_json(meta_path)
    total = len(meta_data_list)
    for start in range(start_index, total, batch_size):
        end = min(start + batch_size, total)
        # Only the two fields the model needs are sent, to keep prompts small.
        batch_meta_data_list = [
            {"question": item["question"], "reference": item["reference"]}
            for item in meta_data_list[start:end]
        ]
        request_content = prompt.format(meta_data_list=batch_meta_data_list)
        _, response_content = dash_request(request_content)
        crash_data_list = _get_response_json(response_content)
        # Append after every batch so completed work survives a later failure.
        append_jsonl(crash_data_list, output_path)
        print(f"Finished range {start} to {end}")


if __name__ == "__main__":
    # Manual smoke test: send a hand-written batch of question/reference pairs
    # followed by the checking instruction, then print what comes back.
    # NOTE(review): the items in this sample are not comma-separated, so the
    # text is not a well-formed list; it is sent to the model verbatim.
    sample_prompt = """
        [{"question": "where did they film hot tub time machine", "reference": ["Fernie Alpine Resort"]}
        {"question": "who has the right of way in international waters", "reference": ["Neither vessel"]}
        {"question": "who does annie work for attack on titan", "reference": ["Marley"]}
        {"question": "when was the immigration reform and control act passed", "reference": ["November\u00a06, 1986"]}
        {"question": "when was puerto rico added to the usa", "reference": ["1950"]}
        {"question": "who has been chosen for best supporting actress in 64 national filmfare award", "reference": ["Zaira Wasim"]}
        {"question": "which side of the white house is the front", "reference": ["North"]}
        {"question": "names of the metropolitan municipalities in south africa", "reference": ["Mangaung Metropolitan Municipality", "Nelson Mandela Bay Metropolitan Municipality", "eThekwini Metropolitan Municipality", "City of Tshwane Metropolitan Municipality", "City of Johannesburg Metropolitan Municipality", "Buffalo City Metropolitan Municipality", "City of Ekurhuleni Metropolitan Municipality"]}
        {"question": "who's hosting the super bowl in 2019", "reference": ["Atlanta, Georgia"]}
        {"question": "in which year vivo launch its first phone in india", "reference": ["2014"]}
        {"question": "where does it talk about mary magdalene in the bible", "reference": ["New Testament"]}
        {"question": "who carries the nuclear football for the president", "reference": ["aide-de-camp"]}
        {"question": "what is the origin of the name cynthia", "reference": ["Greek"]}]

        请你检查列表中每一项 question 对应的 reference 是否正确，如果正确则不改变其答案；如果不正确请修改 reference 中的内容。请注意，不要更改列表的格式，并且 question 和 reference 均是英文形式给出。
    """
    thinking_text, reply_text = dash_request(sample_prompt)

    # Reasoning first, then a separator line, then the final answer.
    print(thinking_text)
    print("----------------------------------")
    print(reply_text)
