from vllm import LLM, SamplingParams
import jsonlines
from tqdm import tqdm
from transformers import AutoTokenizer
import os
import sys

sys.path.append("../../")
from bench.dataset.data_loading import load_test, load_articles, get_full_texts, get_titles


if __name__ == "__main__":
    # Entry point: run full-context generation with vLLM over the benchmark
    # test samples of one `sample_level`, storing the sampled completions on
    # each sample under the "generations" key.
    dataset_dir = "../../bench"
    # Hugging Face model id; used both for the vLLM engine and the tokenizer.
    model_name_official = "Qwen/Qwen2.5-7B-Instruct-1M"
    target_mode = "test_full"
    vllm_tensor_parallel_size = 4
    # Upper bound on the number of tokens generated per completion.
    max_generation_tokens = 10240

    # Engine context window per sample level. The 128k and 512k entries are
    # the level's token count plus `max_generation_tokens`
    # (131072 + 10240, 524288 + 10240); "1024k" uses a fixed 1010000 window.
    # NOTE(review): the "64k" entry was previously 4096, which is smaller than
    # the generation budget alone and cannot hold a 64k-token prompt. It is
    # set here to 65536 + 10240 following the same pattern -- confirm.
    max_model_length_by_level = {
        "64k": 75776,
        "128k": 141312,
        "512k": 534528,
        "1024k": 1010000,
    }
    sample_level = "64k"
    vllm_max_model_length = max_model_length_by_level[sample_level]

    llm = LLM(
        model=model_name_official,
        max_model_len=vllm_max_model_length,
        tensor_parallel_size=vllm_tensor_parallel_size,
    )
    # n=3 completions are sampled per prompt; max_tokens caps generation length.
    sampling_params = SamplingParams(
        n=3,
        temperature=0.7,
        top_p=0.8,
        repetition_penalty=1.05,
        max_tokens=max_generation_tokens,
    )

    articles_all = load_articles(articles_folder=dataset_dir + "/article/")

    samples_test = load_test(prefix=sample_level, samples_folder=dataset_dir + "/dataset/samples/final/")
    print("original samples loaded", len(samples_test))

    tokenizer = AutoTokenizer.from_pretrained(model_name_official)
    print(tokenizer.chat_template)

    # The prompt template is the same for every sample: read it once and close
    # the file deterministically instead of re-opening it on each iteration.
    with open(dataset_dir + "/../test_full/full_instruction.txt", encoding="utf-8") as template_file:
        instruction_template = template_file.read()

    for sample_index, sample in tqdm(enumerate(samples_test), total=len(samples_test),
                                     desc=f"{sample_level}"):
        # Samples that already carry generations are left untouched.
        if "generations" in sample:
            continue

        question = sample["question"]
        markdowns = get_full_texts(sample, articles_all)
        context = "\n".join(markdowns)
        instruction = instruction_template.replace("<question>", question)

        # Truncate the article context for the largest level so the prompt
        # still fits into the engine's context window.
        if sample_level == "1024k":
            model_max_window = vllm_max_model_length
            tokenized_instruction = tokenizer.encode(instruction)
            print("tokenized_instruction", len(tokenized_instruction))

            # Token budget for the articles: the window minus the instruction
            # tokens and minus twice the token count of the comma-joined
            # article titles (reserved as room for the generated answer).
            titles = get_titles(sample, articles_all)
            tokenized_titles = tokenizer.encode(", ".join(titles))
            input_size = model_max_window - len(tokenized_titles) * 2 - len(tokenized_instruction)

            tokenized_context = tokenizer.encode(context)
            print("tokenized_context", len(tokenized_context))
            if len(tokenized_context) > input_size:
                context = tokenizer.decode(tokenized_context[:input_size])
        prompt_content = instruction.replace("<articles>", context)

        conversation = [{"role": "user", "content": prompt_content}]
        text = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True
        )
        conversation_outputs = llm.generate([text], sampling_params, use_tqdm=False)

        # Flatten the sampled completions of every request output into one list.
        generations = [
            completion.text
            for conversation_output in conversation_outputs
            for completion in conversation_output.outputs
        ]

        sample["generations"] = generations
        print([sample["answer"]] + sample["generations"])
        samples_test[sample_index] = sample