from datetime import datetime
import sys
import json
import jsonlines
from tqdm import tqdm

# Relative directory that is prepended to the module search path below.
# NOTE(review): nothing from this directory is imported in the visible code;
# confirm the sys.path change is still required.
project_path = "meta-researcher"
sys.path.insert(0, project_path)

# Raw dataset that the __main__ block reads.
input_file = "../data/experiments/GAIA/dev.jsonl"
# Intermediate dump: the raw records with the "src"/"tgt" fields added.
mid_file = "../data/experiments/GAIA/dev_mid_base.jsonl"
# Final prompt records produced by build_output_data.
output_file = "../data/test_data/Toolcall_dev_GAIA_base.jsonl"
# Split name: used as the record id prefix and stored as extra_info["split"].
pattern = "validation"

# Prompt template for the user message of every record. build_output_data
# fills the single "{Input}" placeholder with the question via str.format, so
# any other literal braces added to this text would have to be doubled.
instruction = """
Answer the given question. You can use the tools provided to you to answer the question. You can use the tool as many times as you want.
You must first conduct reasoning inside <think>...</think>. If you need to use the tool, you can use the tool call <tool_call>...</tool_call> to call the tool after <think>...</think>.
When you have the final answer, you can output the answer inside <answer>...</answer>.

Output format for tool call:
<think>
...
</think>
<tool_call>
...
</tool_call>

Output format for answer:
<think>
...
</think>
<answer>
...
</answer>

Question: {Input}
"""


def read_jsonl(file_path):
    """Read a JSONL file and return its records as a list.

    The file is opened as UTF-8 text and parsed with ``jsonlines.Reader``;
    every object the reader yields becomes one list element, in file order.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        reader = jsonlines.Reader(handle)
        return list(reader)


def save_to_jsonl(data, file_path):
    """Write the items of ``data`` to ``file_path`` in JSONL form.

    Each item is serialised with ``json.dumps`` and followed by a newline.
    The file is opened in write mode with UTF-8 encoding, so existing
    content is replaced, and non-ASCII characters are written verbatim
    rather than as ``\\uXXXX`` escapes.
    """
    with open(file_path, mode='w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )


def build_output_data(mid_dataset, pattern="train"):
    """Convert intermediate records into tool-call prompt records.

    Args:
        mid_dataset: Iterable of dicts. The question is read from ``"src"``,
            the answer from ``"tgt"`` and the supporting facts from
            ``"supporting_facts"``; missing keys fall back to ``""``, ``""``
            and ``[]`` respectively.
        pattern: Split name. It prefixes each record ``id`` and is stored
            under ``extra_info["split"]``.

    Returns:
        A list with one dict per input record, in input order. Answers and
        supporting facts are stored as JSON-encoded strings; the prompt is a
        single user message built from the module-level ``instruction``
        template.
    """
    output_data = []
    for idx, data in enumerate(tqdm(mid_dataset, desc="Building Output Data")):
        question = data.get("src", "")
        answer = data.get("tgt", "")
        supporting_facts = data.get("supporting_facts", [])

        # A scalar answer is wrapped so downstream always sees a list.
        golden_answers = answer if isinstance(answer, list) else [answer]

        # Serialise once; the same strings are reused in several fields.
        golden_answers_json = json.dumps(golden_answers, ensure_ascii=False)
        supporting_facts_json = json.dumps(supporting_facts, ensure_ascii=False)

        prompt_content = instruction.format(Input=question)

        data_source = "closed_question"
        new_item = {
            "id": f"{pattern}_{idx}",
            "question": question,
            "golden_answers": golden_answers_json,
            "supporting_facts": supporting_facts_json,
            "level": "",
            "data_source": data_source,
            "prompt": [
                {
                    "content": prompt_content,
                    "role": "user",
                }
            ],
            "ability": "closed_question",
            "reward_model": {
                # An empty answer list yields "" rather than the string "[]".
                "ground_truth": golden_answers_json if golden_answers else "",
                "style": "rule",
            },
            "extra_info": {
                "answer": golden_answers[0] if golden_answers else "",
                "index": str(idx),
                "question": question,
                "split": pattern,
                "supporting_facts": supporting_facts_json,
            },
        }

        output_data.append(new_item)
    return output_data


if __name__ == "__main__":
    origin_dataset = read_jsonl(input_file)

    # Answer keys, tried in order. "answer" stays first so files that already
    # worked produce the same result.
    # NOTE(review): the original looked up "answer" twice, leaving the
    # fallback dead. "Answer" mirrors the "Question"/"question" handling and
    # "Final answer" is the field name in the published GAIA metadata;
    # confirm both against the actual schema of dev.jsonl.
    answer_keys = ("answer", "Answer", "Final answer")

    # Normalise each raw record in place by adding the "src"/"tgt" fields
    # that build_output_data reads.
    mid_dataset = []
    for one_data in origin_dataset:
        one_data["src"] = one_data.get("Question", one_data.get("question", ""))
        one_data["tgt"] = next(
            (one_data[key] for key in answer_keys if key in one_data), ""
        )
        mid_dataset.append(one_data)
    save_to_jsonl(mid_dataset, mid_file)
    output_dataset = build_output_data(mid_dataset, pattern=pattern)
    save_to_jsonl(output_dataset, output_file)

    print(f"Processed {len(output_dataset)} items and saved to {output_file}")