# Licensed under the MIT license.

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

import torch
# Number of CUDA devices visible to this process, queried once at import time.
num_gpus = torch.cuda.device_count()
# One device string per visible GPU: "cuda:0", "cuda:1", ...
devices = ["cuda:%d" % gpu_index for gpu_index in range(num_gpus)]

def load_vLLM_model(model_ckpt, tensor_parallel_size=None, half_precision=False, max_num_seqs=256):
    """Load the tokenizer and the vLLM engine for ``model_ckpt``.

    Args:
        model_ckpt: Model name or checkpoint path; passed to both
            ``AutoTokenizer.from_pretrained`` and ``vllm.LLM``.
        tensor_parallel_size: Number of GPUs to shard the model across.
            ``None`` (the default) uses every GPU torch reports via the
            module-level ``num_gpus``, which is what this function previously
            did regardless of the argument.
        half_precision: When true, request ``dtype="half"`` from vLLM;
            otherwise no dtype is passed and vLLM's own default applies.
        max_num_seqs: Forwarded to ``vllm.LLM`` as ``max_num_seqs``.

    Returns:
        A ``(tokenizer, llm)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    if tensor_parallel_size is None:
        # Default to all visible GPUs; fall back to 1 so a machine where
        # torch reports no GPU does not request a parallel size of 0.
        tensor_parallel_size = max(num_gpus, 1)

    engine_kwargs = {
        "model": model_ckpt,
        # Number of tensor-parallel shards.
        "tensor_parallel_size": tensor_parallel_size,
        "max_num_seqs": max_num_seqs,
    }
    if half_precision:
        engine_kwargs["dtype"] = "half"

    llm = LLM(**engine_kwargs)

    return tokenizer, llm


_QWEN_SYSTEM_PROMPT = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."


def _build_qwen_prompt(tokenizer, user_content):
    """Render one user message (plus the fixed system prompt) via the chat template."""
    messages = [
        {"role": "system", "content": _QWEN_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )


def generate_with_Qwen_vLLM_model(
    model,
    tokenizer,
    input,
    temperature=0.7,
    top_p=0.8,
    top_k=40,
    repetition_penalty=1.05,
    n=1,
    max_tokens=512,
    stop=None,
):
    """Generate completions for one or more user messages with a Qwen chat prompt.

    Each user message is wrapped with the fixed Qwen system prompt, rendered
    to text with ``tokenizer.apply_chat_template`` and sent to
    ``model.generate`` in a single batch.

    Args:
        model: Object exposing ``generate(prompts, sampling_params, use_tqdm=...)``.
        tokenizer: Object exposing ``apply_chat_template``.
        input: A single user message (``str``) or a list/tuple of them. The
            name shadows the builtin but is kept for caller compatibility.
        temperature, top_p, top_k, repetition_penalty, n, max_tokens:
            Forwarded unchanged to ``SamplingParams``.
        stop: Stop strings forwarded to ``SamplingParams``; ``None`` means
            no stop strings.

    Returns:
        Whatever ``model.generate`` returns for the batch of prompts.

    Raises:
        TypeError: If ``input`` is neither a string nor a list/tuple.
    """
    # Validate and normalise the input first so a bad argument fails fast
    # with a clear message instead of an unbound-variable error later on.
    if isinstance(input, str):
        user_messages = [input]
    elif isinstance(input, (list, tuple)):
        user_messages = list(input)
    else:
        raise TypeError(
            f"input must be a str or a list of str, got {type(input).__name__}"
        )

    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        n=n,
        max_tokens=max_tokens,
        stop=[] if stop is None else stop,
    )

    prompts = [_build_qwen_prompt(tokenizer, message) for message in user_messages]

    return model.generate(prompts, sampling_params, use_tqdm=False)



