import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# The generation quality for 3p is too low for both Llama and ChatGPT (the outputs are too poor), so we give up on 3-hop.
# We should write a different prompt for each kind of question.
def call_LLM(llm_model, tokenizer, prompts, few_shot_prompt, no_prompt_head=False):
    """Generate one completion per prompt with a causal language model.

    Every prompt is wrapped as ``few_shot_prompt + prompt``, followed by a
    newline and the literal ``Output: ``. The whole batch is then tokenized
    and passed to ``llm_model.generate`` in a single call.

    Args:
        llm_model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``llm_model``; it must have a pad token
            because the batch is padded.
        prompts: Iterable of question strings.
        few_shot_prompt: Text prepended to every prompt.
        no_prompt_head: If True, return only the newly generated text;
            otherwise each returned string is the decoded prompt followed by
            the generated text.

    Returns:
        A list of decoded strings, one per prompt, in input order. An empty
        ``prompts`` yields an empty list without calling the model.
    """
    new_prompts = [few_shot_prompt + p + "\nOutput: " for p in prompts]
    if not new_prompts:
        return []

    # Decoder-only models continue from the last position of each row, so the
    # batch must be padded on the left; with right padding the shorter prompts
    # would be continued from pad tokens. The tokenizer's own setting is
    # restored afterwards so other users of the tokenizer are unaffected.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(new_prompts, return_tensors="pt", padding=True, truncation=True)
    finally:
        tokenizer.padding_side = previous_padding_side
    inputs = inputs.to(llm_model.device)

    # Every row of the batch is padded to this length, and ``generate``
    # returns the input ids followed by the new tokens.
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = llm_model.generate(**inputs, max_new_tokens=1024)

    if no_prompt_head:
        # Drop the prompt at token level. Slicing the decoded text by
        # len(prompt) is unreliable: the decoded prompt is not guaranteed to
        # match the input string character for character, and truncation can
        # shorten it.
        outputs = outputs[:, prompt_len:]

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)




def get_model(model_name, access_token, output_hidden_states=False):
    """Load a causal language model and its tokenizer by short name.

    Args:
        model_name: One of the short names listed in the table below.
        access_token: Passed as ``token`` to both ``from_pretrained`` calls.
        output_hidden_states: Forwarded to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is guaranteed to have a
        pad token (it falls back to the EOS token).

    Raises:
        KeyError: If ``model_name`` is not one of the supported short names.
    """
    model_name2model_path = {
        "Llama-2-7b" : "meta-llama/Llama-2-7b-chat-hf",
        "Llama-2-13b" : "meta-llama/Llama-2-13b-chat-hf",
        # NOTE(review): this repo id does not follow the naming of the two
        # entries above ("Llama-2-*-chat-hf") -- confirm it exists on the Hub.
        "Llama-2-70B-Instruct" : "meta-llama/Meta-Llama-2-70B-Instruct"
    }

    # Fail early with a helpful message instead of a bare KeyError raised in
    # the middle of the loading call.
    if model_name not in model_name2model_path:
        supported = ", ".join(model_name2model_path)
        raise KeyError(f"Unknown model name {model_name!r}; supported names: {supported}")
    model_path = model_name2model_path[model_name]

    tokenizer = AutoTokenizer.from_pretrained(model_path, token=access_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        token=access_token,
        output_hidden_states=output_hidden_states,
    )

    print(f"memory usage: {torch.cuda.memory_allocated()/1000/1000/1000} GB")
    print(next(model.parameters()).is_cuda)

    # Batched tokenization with padding=True needs a pad token. Reuse EOS when
    # the tokenizer defines none; otherwise padding fails with an
    # "out of index range" error.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer
