import time
from openai import OpenAI
import random
import jsonlines
from tqdm import tqdm
from transformers import AutoTokenizer
import os

import sys

sys.path.append("../")
from bench.dataset.data_loading import load_test, load_articles, get_full_texts, get_titles

if __name__ == "__main__":
    model_name_official = "meta-llama/Llama-3.3-70B-Instruct"
    model_name_save = "llama33_70b_instruct"
    target_mode = "test_full"
    # sample_level = "64k"  # vllm 75776
    sample_level = "128k"  # vllm 131072
    # sample_level = "512k"  # vllm 131072
    # sample_level = "1024k"  # vllm 131072

    articles_all = load_articles(articles_folder="../bench/article/")

    save_name = f"../results_test/{sample_level}_{target_mode}_{model_name_save}.jsonl"
    if os.path.exists(save_name):
        samples_test = []
        with jsonlines.open(save_name) as reader:
            for line in reader:
                samples_test.append(line)
        print("existing results loaded", len(samples_test))
    else:
        samples_test = load_test(prefix=sample_level, samples_folder="../bench/dataset/samples/final/")
        print("original samples loaded", len(samples_test))

    openai_api_key = "EMPTY"
    openai_api_base = "http://localhost:8000/v1"
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name_official)
    for sample_index, sample in tqdm(enumerate(samples_test), total=len(samples_test),
                                     desc=f"{model_name_save}_{sample_level}"):
        if "generations" not in sample.keys():
            question = sample["question"]
            markdowns = get_full_texts(sample, articles_all)
            context = "\n".join(markdowns)
            instruction = open("full_instruction.txt").read()
            instruction = instruction.replace("<question>", question)

            # truncate the input texts when model context is larger than the sample level
            if sample_level in ["128k", "512k", "1024k"]:
                model_max_window = 131072
                tokenized_instruction = tokenizer.encode(instruction)
                print("tokenized_instruction", len(tokenized_instruction))

                # the max generation size is set to the number of title tokens in all input articles
                titles = get_titles(sample, articles_all)
                tokenized_titles = tokenizer.encode(", ".join(titles))
                input_size = model_max_window - len(tokenized_titles) * 2 - len(tokenized_instruction)

                tokenized_context = tokenizer.encode(context)
                print("tokenized_context", len(tokenized_context))
                if len(tokenized_context) > input_size:
                    context = tokenizer.decode(tokenized_context[:input_size])

            prompt_content = instruction.replace("<articles>", context)
            while True:
                try:
                    output_dict = client.chat.completions.create(
                        model=model_name_official,
                        messages=[
                            {"role": "user",
                             "content": prompt_content}
                            ],
                        n=3
                        )
                    outputs = []
                    for choice in output_dict.choices:
                        outputs.append(choice.message.content)
                    print([sample["answer"]] + outputs)
                    break
                except Exception as e:
                    print(e)
                    if "limit" in str(e):
                        time.sleep(2)
            sample["generations"] = outputs

            samples_test[sample_index] = sample
            if sample_index % 10 == 0:
                with jsonlines.open(save_name, "w") as writer:
                    writer.write_all(samples_test)

    with jsonlines.open(save_name, "w") as writer:
        writer.write_all(samples_test)

