import sys
import json
import jsonlines
from tqdm import tqdm

# Directory prepended to sys.path below; it is a relative path, so it resolves
# against the current working directory at run time.
project_path = "meta-researcher"
sys.path.insert(0, project_path)

# Source QA pairs (JSONL); the script reads each record's "question"/"answer" keys.
input_file = "../data/qa_pair/Orion_Zh_En_310.jsonl"
# Intermediate dump: the input records with "src"/"tgt" fields added.
mid_file = "../data/qa_pair/Orion_Zh_En_310_mid_base.jsonl"
# Final structured records returned by build_output_data().
output_file = "../data/train_data/Toolcall_Orion_Zh_En_base.jsonl"
# Split label passed to build_output_data(); used as the id prefix and split name.
pattern = "train"

# User-prompt template. The "{Input}" placeholder is filled with the question
# through str.format() in build_output_data(). The text instructs the model to
# reason inside <think>, optionally emit a <tool_call>, and end with <answer>.
instruction = """
Answer the given question. You can use the tools provided to you to answer the question. You can use the tool as many times as you want.
You must first conduct reasoning inside <think>...</think>. If you need to use the tool, you can use the tool call <tool_call>...</tool_call> to call the tool after <think>...</think>.
When you have the final answer, you can output the answer inside <answer>...</answer>.

Output format for tool call:
<think>
...
</think>
<tool_call>
...
</tool_call>

Output format for answer:
<think>
...
</think>
<answer>
...
</answer>

Question: {Input}
"""

def read_jsonl(file_path):
    """Load every record of the UTF-8 JSONL file at *file_path* into a list."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        reader = jsonlines.Reader(handle)
        # Materialize inside the ``with`` so the file is still open while reading.
        records = list(reader)
    return records


def save_to_jsonl(data, file_path):
    """Write each item of *data* to *file_path* as one JSON document per line.

    The target file is overwritten and encoded as UTF-8; non-ASCII characters
    are written verbatim instead of being escaped.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.writelines(
            f"{json.dumps(record, ensure_ascii=False)}\n" for record in data
        )


def build_output_data(mid_dataset, pattern="train"):
    """Convert mid-format QA records into structured training records.

    Args:
        mid_dataset: Iterable of dict-like records. Each record is read via
            ``.get`` for ``"src"`` (the question), ``"tgt"`` (the answer, a
            single value or a list) and ``"supporting_facts"`` (defaults to
            an empty list).
        pattern: Split label; used as the ``id`` prefix and stored in
            ``extra_info["split"]``.

    Returns:
        A list with one dict per input record containing the question, the
        JSON-encoded golden answers and supporting facts, the formatted user
        prompt, and the ``reward_model`` / ``extra_info`` sub-dicts.
    """
    data_source = "closed_question"
    output_data = []
    for idx, data in enumerate(tqdm(mid_dataset, desc="Building Output Data")):
        question = data.get("src", "")
        answer = data.get("tgt", "")
        supporting_facts = data.get("supporting_facts", [])

        # Normalize to a list so a single answer and many answers share a shape.
        golden_answers = answer if isinstance(answer, list) else [answer]

        # Serialize once and reuse; both values appear in several fields.
        golden_answers_json = json.dumps(golden_answers, ensure_ascii=False)
        supporting_facts_json = json.dumps(supporting_facts, ensure_ascii=False)
        # An empty answer list is stored as "" rather than "[]" in these fields.
        ground_truth = golden_answers_json if golden_answers else ""

        prompt_content = instruction.format(Input=question)

        new_item = {
            "id": f"{pattern}_{idx}",
            "question": question,
            "golden_answers": golden_answers_json,
            # Previously this field serialized the golden answers by mistake.
            "supporting_facts": supporting_facts_json,
            "data_source": data_source,
            "prompt": [
                {
                    "content": prompt_content,
                    "role": "user",
                }
            ],
            "ability": "closed_question",
            "reward_model": {
                "ground_truth": ground_truth,
                "style": "rule",
            },
            "extra_info": {
                "answer": ground_truth,
                "index": str(idx),
                "question": question,
                "split": pattern,
                "supporting_facts": supporting_facts_json,
            },
        }

        output_data.append(new_item)
    return output_data


if __name__ == "__main__":
    # Step 1: Read the input JSONL and add the mid-format "src"/"tgt" fields.
    origin_dataset = read_jsonl(input_file)

    mid_dataset = []
    for record in origin_dataset:
        # Records are updated in place; the original keys are kept alongside
        # the new "src"/"tgt" aliases. Missing keys fall back to "".
        record["src"] = record.get("question", "")
        record["tgt"] = record.get("answer", "")
        mid_dataset.append(record)

    # Step 2: Persist the mid-format records to mid_file.
    save_to_jsonl(mid_dataset, mid_file)

    # Step 3: Build and save the final structured format.
    output_dataset = build_output_data(mid_dataset, pattern=pattern)
    save_to_jsonl(output_dataset, output_file)

    print(f"Processed {len(output_dataset)} items and saved to {output_file}")