import json
from tqdm import tqdm

from system_prompt import get_compression_prompt


def load_jsonl(in_file):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        in_file: Path of the UTF-8 encoded file to read. Each non-blank line
            must hold exactly one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    datas = []
    with open(in_file, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            # Blank lines (e.g. a stray trailing newline) carry no record and
            # would otherwise make json.loads raise, aborting the whole load.
            if not line.strip():
                continue
            datas.append(json.loads(line))
    return datas


def save_jsonl(datas, out_file):
    """Serialize each item of ``datas`` as one JSON line in ``out_file``.

    The file is opened in write mode (existing content is replaced) with
    UTF-8 encoding, and non-ASCII characters are written unescaped.

    Args:
        datas: Iterable of JSON-serializable records.
        out_file: Path of the file to write.
    """
    with open(out_file, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n"
            for record in tqdm(datas)
        )


def _convert_message(message):
    """Translate one input chat message into zero or more output messages.

    Args:
        message: Dict with a "role" key and, depending on the role, "content"
            and/or "tool_calls".

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts. Roles other than
        assistant/tool/system/user produce an empty list.

    Raises:
        AssertionError: If an assistant message carries more than one tool call.
    """
    role = message["role"]

    if role == "assistant":
        converted = []
        # NOTE(review): content is forwarded whenever the key is present, even
        # if its value is None or empty -- confirm the trainer accepts that.
        if "content" in message:
            converted.append({"role": "assistant", "content": message["content"]})
        # .get() + truthiness treats a missing key, None and [] alike as
        # "no tool calls" instead of crashing on len(None).
        tool_calls = message.get("tool_calls")
        if tool_calls:
            if len(tool_calls) != 1:
                # Raised explicitly rather than via `assert` so the check is
                # not stripped under `python -O`; the exception type is kept.
                raise AssertionError(
                    f"more than one tool call in an assistant message: {message}"
                )
            converted.append(
                {"role": "function_call", "content": json.dumps(tool_calls[0]["function"])}
            )
        return converted

    if role == "tool":
        return [{"role": "observation", "content": message["content"]}]

    if role in ("system", "user"):
        return [{"role": role, "content": message["content"]}]

    # Any other role is dropped from the output.
    return []


def convert_to_llama_factory_format(in_file, out_file):
    """Convert a JSONL dataset of chat records into training samples.

    Records whose "kind" is "uncompressed_messages" are skipped. Every other
    record yields one sample holding the converted "messages" and the record's
    "tools" serialized as a JSON string.

    When a record contains a system message whose content starts with
    "<COMPRESSED_HISTORY>" (the last such message wins) and its
    "compressed_section" is non-empty, a second sample is emitted: the
    compression prompt as system message, the JSON dump of
    "compressed_section" as user message, and the compressed system text as
    the assistant reply.

    Args:
        in_file: Path of the input JSONL file.
        out_file: Path of the output JSONL file (overwritten).

    Raises:
        AssertionError: If an assistant message carries more than one tool call.
        KeyError: If a record or message lacks a key that is read here.
    """
    new_datas = []

    for data in tqdm(load_jsonl(in_file)):
        if data["kind"] == "uncompressed_messages":
            continue

        new_messages = []
        compressed_text = None
        for message in data["messages"]:
            new_messages.extend(_convert_message(message))
            if message["role"] == "system" and message["content"].startswith("<COMPRESSED_HISTORY>"):
                compressed_text = message["content"]

        new_datas.append(
            {
                "messages": new_messages,
                "tools": json.dumps(data["tools"]),
            }
        )

        # "compressed_section" is only read when a compressed system message
        # was found, so records without one need not carry that key.
        if compressed_text is not None and len(data["compressed_section"]) > 0:
            compression_messages = [
                {"role": "system", "content": get_compression_prompt()},
                {"role": "user", "content": json.dumps(data["compressed_section"])},
                {"role": "assistant", "content": compressed_text},
            ]
            new_datas.append(
                {
                    "messages": compression_messages,
                    "tools": "[]",
                }
            )

    save_jsonl(new_datas, out_file)


if __name__ == "__main__":
    # Script entry point: run the conversion on the hard-coded paths below.
    source_path = "src/run_process_data/jsonl_files/nextjs_github-repos_filtered-with-info_backtranslated.jsonl"
    target_path = "/root/user/code_agent/LLaMA-Factory-FullStack-Agent/data/nextjs_github-repos_filtered-with-info_backtranslated_sharegpt.jsonl"
    convert_to_llama_factory_format(in_file=source_path, out_file=target_path)
